diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359f03da6..760c7087f677a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -338,8 +338,8 @@ class WaitcntBrackets { const MachineOperand &Op) const; bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, bool OptNone) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count, bool OptNone) const; void determineWait(InstCounterType T, RegInterval Interval, AMDGPU::Waitcnt &Wait) const; @@ -1164,22 +1164,33 @@ void WaitcntBrackets::print(raw_ostream &OS) const { /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. -void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { - simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); - simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); - simplifyWaitcnt(DS_CNT, Wait.DsCnt); - simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); - simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); - simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); - simplifyWaitcnt(KM_CNT, Wait.KmCnt); - simplifyWaitcnt(X_CNT, Wait.XCnt); +void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait, + bool OptNone) const { + simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt, OptNone); + simplifyWaitcnt(EXP_CNT, Wait.ExpCnt, OptNone); + simplifyWaitcnt(DS_CNT, Wait.DsCnt, OptNone); + simplifyWaitcnt(STORE_CNT, Wait.StoreCnt, OptNone); + simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt, OptNone); + simplifyWaitcnt(BVH_CNT, Wait.BvhCnt, OptNone); + simplifyWaitcnt(KM_CNT, Wait.KmCnt, OptNone); + simplifyWaitcnt(X_CNT, Wait.XCnt, OptNone); } -void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, - unsigned &Count) const { +void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count, + bool 
OptNone) const { // The number of outstanding events for this type, T, can be calculated // as (UB - LB). If the current Count is greater than or equal to the number // of outstanding events, then the wait for this counter is redundant. + // + // For counts that are at max value or above, try this even when optimizations + // are disabled. This helps remove max waitcnts that are inserted by the + // memory legalizer by default, but does not optimize actual waitcnts that + // are otherwise inserted by the memory legalizer or a previous pass of the + // inserter. The corner case is a max waitcnt that gets optimized away even + // though it was deliberately chosen rather than emitted as a default. This + // only marginally affects the usefulness of OptNone. + if (Count < getWaitCountMax(T) && OptNone) + return; if (Count >= getScoreRange(T)) Count = ~0u; } @@ -1363,19 +1374,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( } unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); - bool TrySimplify = Opcode != II.getOpcode() && !OptNone; + bool OpcodeIsSoft = Opcode != II.getOpcode(); // Update required wait count. If this is a soft waitcnt (= it was added // by an earlier pass), it may be entirely removed. if (Opcode == AMDGPU::S_WAITCNT) { unsigned IEnc = II.getOperand(0).getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(OldWait); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait, OptNone); Wait = Wait.combined(OldWait); // Merge consecutive waitcnt of the same type by erasing multiples. 
- if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) { + if (WaitcntInstr || + (!Wait.hasWaitExceptStoreCnt() && OpcodeIsSoft && !OptNone)) { II.eraseFromParent(); Modified = true; } else @@ -1386,11 +1398,13 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( unsigned OldVSCnt = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt, + OptNone); Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); - if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) { + if (WaitcntVsCntInstr || + (!Wait.hasWaitStoreCnt() && OpcodeIsSoft && !OptNone)) { II.eraseFromParent(); Modified = true; } else @@ -1528,7 +1542,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( // by an earlier pass), it may be entirely removed. unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); - bool TrySimplify = Opcode != II.getOpcode() && !OptNone; + bool OpcodeIsSoft = Opcode != II.getOpcode(); // Don't crash if the programmer used legacy waitcnt intrinsics, but don't // attempt to do more than that either. 
@@ -1539,16 +1553,16 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( unsigned OldEnc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(OldWait); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait, OptNone); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedLoadDsCntInstr; } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { unsigned OldEnc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(OldWait); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait, OptNone); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; } else { @@ -1556,8 +1570,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( assert(CT.has_value()); unsigned OldCnt = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt, OptNone); addWait(Wait, CT.value(), OldCnt); UpdatableInstr = &WaitInstrs[CT.value()]; } @@ -2009,7 +2023,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, } // Verify that the wait is actually needed. - ScoreBrackets.simplifyWaitcnt(Wait); + ScoreBrackets.simplifyWaitcnt(Wait, /* OptNone = */ false); // When forcing emit, we need to skip terminators because that would break the // terminators of the MBB if we emit a waitcnt between terminators. 
@@ -2238,7 +2252,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst, NeedsEndPGMCheck = true; } - ScoreBrackets.simplifyWaitcnt(Wait); + ScoreBrackets.simplifyWaitcnt(Wait, /* OptNone = */ false); auto SuccessorIt = std::next(Inst.getIterator()); bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets, diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3212060f303a5..f015d3ad7811e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1074,8 +1074,6 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, AtomicOrdering Order) const { - bool Changed = false; - MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); @@ -1149,21 +1147,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } - if (VMCnt || LGKMCnt) { - unsigned WaitCntImmediate = - AMDGPU::encodeWaitcnt(IV, - VMCnt ? 0 : getVmcntBitMask(IV), - getExpcntBitMask(IV), - LGKMCnt ? 0 : getLgkmcntBitMask(IV)); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) - .addImm(WaitCntImmediate); - Changed = true; - } + // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts + // will later use this marker to add additional waits such as those required + // from direct load to LDS (formerly known as LDS DMA). + unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt( + IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV), + LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) + .addImm(WaitCntImmediate); if (Pos == Position::AFTER) --MI; - return Changed; + return true; } bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, @@ -1966,8 +1962,6 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, AtomicOrdering Order) const { - bool Changed = false; - MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); @@ -2057,28 +2051,25 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } - if (VMCnt || LGKMCnt) { - unsigned WaitCntImmediate = - AMDGPU::encodeWaitcnt(IV, - VMCnt ? 0 : getVmcntBitMask(IV), - getExpcntBitMask(IV), - LGKMCnt ? 0 : getLgkmcntBitMask(IV)); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) - .addImm(WaitCntImmediate); - Changed = true; - } + // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts + // will later use this marker to add additional waits such as those required + // from direct load to LDS (formerly known as LDS DMA). + unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt( + IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV), + LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) + .addImm(WaitCntImmediate); if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) .addImm(0); - Changed = true; } if (Pos == Position::AFTER) --MI; - return Changed; + return true; } bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, @@ -2287,8 +2278,6 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, AtomicOrdering Order) const { - bool Changed = false; - MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); @@ -2372,23 +2361,26 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); - Changed = true; + } else { + // Always emit a soft wait count, even if it is trivially ~0. + // SIInsertWaitcnts will later use this marker to add additional waits such + // as those required from direct load to LDS (formerly known as LDS DMA). 
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)) + .addImm(getLoadcntBitMask(IV)); } if (STORECnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); - Changed = true; } if (DSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); - Changed = true; } if (Pos == Position::AFTER) --MI; - return Changed; + return true; } bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 8a80afd4a768f..1bbbec977b714 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -880,8 +880,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; GFX10-NEXT: scratch_store_dword v0, v2, off offset:128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 7 ; GFX10-NEXT: s_add_u32 s0, 0x100, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 @@ -921,8 +921,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:384 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-NEXT: s_add_u32 s0, 0x100, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -991,8 +991,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off offset:128 -; 
UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX10-NEXT: s_add_u32 s0, 0x100, s0 ; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 @@ -1032,8 +1032,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; UNALIGNED_GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v2, off offset:384 dlc -; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX11-NEXT: s_add_u32 s0, 0x100, s0 ; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1520,8 +1520,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 ; GFX10-NEXT: scratch_store_dword v0, v2, off offset:128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 7 ; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 @@ -1633,8 +1633,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 ; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off offset:128 -; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX10-NEXT: s_add_u32 s0, 0x4004, s0 ; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 66037615f0ba0..ea6a5d5e74d52 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -199,26 +199,32 @@ entry: define amdgpu_kernel void @singlethread_one_as_acquire() #0 { ; GFX6-LABEL: name: singlethread_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_acquire ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: singlethread_one_as_acquire ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: singlethread_one_as_acquire ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: singlethread_one_as_acquire ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("singlethread-one-as") acquire @@ -228,26 +234,32 @@ entry: define amdgpu_kernel void @singlethread_one_as_release() #0 { ; GFX6-LABEL: name: singlethread_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_release ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: singlethread_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: singlethread_one_as_release 
; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: singlethread_one_as_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("singlethread-one-as") release @@ -257,26 +269,32 @@ entry: define amdgpu_kernel void @singlethread_one_as_acq_rel() #0 { ; GFX6-LABEL: name: singlethread_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_acq_rel ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: singlethread_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: singlethread_one_as_acq_rel ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: singlethread_one_as_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("singlethread-one-as") acq_rel @@ -286,26 +304,32 @@ entry: define amdgpu_kernel void @singlethread_one_as_seq_cst() #0 { ; GFX6-LABEL: name: singlethread_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_one_as_seq_cst ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: singlethread_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: singlethread_one_as_seq_cst ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: 
S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: singlethread_one_as_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("singlethread-one-as") seq_cst @@ -501,10 +525,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire() #0 { ; GFX6-LABEL: name: workgroup_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_acquire @@ -516,6 +542,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_acquire ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acquire @@ -527,6 +554,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_acquire ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") acquire @@ -536,10 +564,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX6-LABEL: name: workgroup_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_release @@ -550,6 +580,7 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_release @@ -560,6 +591,7 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: 
S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") release @@ -569,10 +601,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel @@ -584,6 +618,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel @@ -595,6 +630,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") acq_rel @@ -604,10 +640,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst @@ -619,6 +657,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst @@ -630,6 +669,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") seq_cst @@ -639,26 +679,32 @@ entry: define 
amdgpu_kernel void @wavefront_one_as_acquire() #0 { ; GFX6-LABEL: name: wavefront_one_as_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_acquire ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: wavefront_one_as_acquire ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: wavefront_one_as_acquire ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: wavefront_one_as_acquire ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("wavefront-one-as") acquire @@ -668,26 +714,32 @@ entry: define amdgpu_kernel void @wavefront_one_as_release() #0 { ; GFX6-LABEL: name: wavefront_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_release ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: wavefront_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: wavefront_one_as_release ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: wavefront_one_as_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("wavefront-one-as") release @@ -697,26 +749,32 @@ entry: define amdgpu_kernel void @wavefront_one_as_acq_rel() #0 { ; GFX6-LABEL: name: 
wavefront_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_acq_rel ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: wavefront_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: wavefront_one_as_acq_rel ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: wavefront_one_as_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("wavefront-one-as") acq_rel @@ -726,26 +784,32 @@ entry: define amdgpu_kernel void @wavefront_one_as_seq_cst() #0 { ; GFX6-LABEL: name: wavefront_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_one_as_seq_cst ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: wavefront_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: wavefront_one_as_seq_cst ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: wavefront_one_as_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("wavefront-one-as") seq_cst @@ -941,26 +1005,32 @@ entry: define amdgpu_kernel void @singlethread_acquire() #0 { ; GFX6-LABEL: name: singlethread_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: 
S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_acquire ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: singlethread_acquire ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: singlethread_acquire ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: singlethread_acquire ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("singlethread") acquire @@ -970,26 +1040,32 @@ entry: define amdgpu_kernel void @singlethread_release() #0 { ; GFX6-LABEL: name: singlethread_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_release ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: singlethread_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: singlethread_release ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: singlethread_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("singlethread") release @@ -999,26 +1075,32 @@ entry: define amdgpu_kernel void @singlethread_acq_rel() #0 { ; GFX6-LABEL: name: singlethread_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: 
name: singlethread_acq_rel ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: singlethread_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: singlethread_acq_rel ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: singlethread_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("singlethread") acq_rel @@ -1028,26 +1110,32 @@ entry: define amdgpu_kernel void @singlethread_seq_cst() #0 { ; GFX6-LABEL: name: singlethread_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: singlethread_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: singlethread_seq_cst ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: singlethread_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: singlethread_seq_cst ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: singlethread_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("singlethread") seq_cst @@ -1397,26 +1485,32 @@ entry: define amdgpu_kernel void @wavefront_acquire() #0 { ; GFX6-LABEL: name: wavefront_acquire ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_acquire ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_acquire ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: wavefront_acquire ; 
GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: wavefront_acquire ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: wavefront_acquire ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("wavefront") acquire @@ -1426,26 +1520,32 @@ entry: define amdgpu_kernel void @wavefront_release() #0 { ; GFX6-LABEL: name: wavefront_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_release ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: wavefront_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: wavefront_release ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: wavefront_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("wavefront") release @@ -1455,26 +1555,32 @@ entry: define amdgpu_kernel void @wavefront_acq_rel() #0 { ; GFX6-LABEL: name: wavefront_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_acq_rel ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: wavefront_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: wavefront_acq_rel ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; 
GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: wavefront_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("wavefront") acq_rel @@ -1484,26 +1590,32 @@ entry: define amdgpu_kernel void @wavefront_seq_cst() #0 { ; GFX6-LABEL: name: wavefront_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: wavefront_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: wavefront_seq_cst ; GFX10WGP: bb.0.entry: + ; GFX10WGP-NEXT: S_WAITCNT_soft 65407 ; GFX10WGP-NEXT: S_ENDPGM 0 ; ; GFX10CU-LABEL: name: wavefront_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: wavefront_seq_cst ; GFX11WGP: bb.0.entry: + ; GFX11WGP-NEXT: S_WAITCNT_soft 65527 ; GFX11WGP-NEXT: S_ENDPGM 0 ; ; GFX11CU-LABEL: name: wavefront_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("wavefront") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll index 51caa84450ff3..a713450809ad0 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll @@ -93,8 +93,8 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo ; GFX10-NEXT: flat_store_dword v[0:1], v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof @@ -723,8 +723,8 @@ define void @calls_intrin_ascast(ptr 
addrspace(3) %ptr) #0 { ; GFX10-NEXT: v_mov_b32_e32 v2, 7 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: flat_store_dword v[0:1], v2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) store volatile i32 7, ptr %1, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index acf2f8add7670..9feb029b9bed3 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -5064,8 +5064,8 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 dlc -; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_nop 0 @@ -5095,8 +5095,8 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 dlc -; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index b0439b1f7968f..9aa19555bcbe0 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -51,8 +51,8 @@ define amdgpu_kernel void 
@test_dynamic_stackalloc_kernel_uniform(i32 %n) { ; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 @@ -70,8 +70,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { ; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16 @@ -135,8 +135,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i ; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 @@ -155,8 +155,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i ; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s0, 15 ; 
GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_and_b32 s0, s0, -16 @@ -216,8 +216,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned( ; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 @@ -235,8 +235,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned( ; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b5e579b78a59c..8999e91208b3a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -1946,8 +1946,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v0 ; GFX10-NEXT: scratch_store_dword v1, v2, off offset:128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 7 ; GFX10-NEXT: s_addk_i32 s0, 0x100 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0 @@ -1963,8 +1963,8 @@ define amdgpu_kernel void 
@store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_addk_i32 s0, 0x100 @@ -2080,8 +2080,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0 ; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off offset:128 -; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100 ; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, s0, v0 @@ -2097,8 +2097,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc -; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100 @@ -3242,8 +3242,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 ; GFX10-NEXT: scratch_store_dword v1, v2, off offset:128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 7 ; GFX10-NEXT: s_addk_i32 s0, 0x4004 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0 @@ -3379,8 +3379,8 
@@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 ; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off offset:128 -; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 ; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index a901d7f97eb37..d4fd7fe1adc7d 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -2943,11 +2943,11 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3072,21 +3072,21 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v36 ; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 dlc -; GFX11-TRUE16-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: buffer_store_b16 v33, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v33, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -3111,8 +3111,8 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; GFX11-FAKE16-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc -; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 1, v32 ; GFX11-FAKE16-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3123,17 +3123,17 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc -; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-FAKE16-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v33, off, 
s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v36, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -3207,11 +3207,11 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b64 v[34:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3357,17 +3357,17 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3523,11 +3523,11 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3606,11 +3606,11 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3808,17 +3808,17 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX11-NEXT: 
buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4106,29 +4106,29 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: buffer_store_b128 v[80:83], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[80:83], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: buffer_store_b128 v[68:71], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[68:71], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: buffer_store_b128 v[64:67], off, s[0:3], 0 dlc ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[64:67], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4587,53 +4587,53 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: buffer_store_b8 v36, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v36, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-TRUE16-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(2) -; GFX11-TRUE16-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: buffer_store_b8 v54, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b8 v54, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: buffer_store_b8 v55, off, s[0:3], 0 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -4677,53 +4677,53 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX11-FAKE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: buffer_store_b8 v36, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v36, 
off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-FAKE16-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: buffer_store_b8 
v54, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b8 v54, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: buffer_store_b8 v55, off, s[0:3], 0 dlc ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 17a5f520ff41e..5ae466a4ca188 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -7802,10 +7802,12 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; NOOPT-NEXT: ; implicit-def: $sgpr0 ; NOOPT-NEXT: v_mov_b32_e32 v0, s0 ; NOOPT-NEXT: ds_write_b32 v0, v2 +; NOOPT-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; NOOPT-NEXT: s_mov_b32 m0, -1 ; NOOPT-NEXT: ; implicit-def: $sgpr0 ; NOOPT-NEXT: v_mov_b32_e32 v0, s0 ; NOOPT-NEXT: ds_write_b32 v0, v1 +; NOOPT-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; NOOPT-NEXT: s_endpgm ; ; SI-MOVREL-LABEL: multi_same_block: diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 0681263b7428e..71e85e4b948d8 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -71,6 +71,7 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %end ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 44415657b6336..d319acb064f31 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -167,6 +167,7 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX8-NOOPT-NEXT: ds_read_b32 v0, v3 ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOOPT-NEXT: s_barrier +; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[0:1], v0, v0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: s_nop 1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 1379eb61e0853..eca04e1a396a0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -16,10 +16,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acquire_fence: @@ -31,14 +33,17 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX10-CU-LABEL: workgroup_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -49,6 +54,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acquire_fence: @@ -66,6 +72,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX11-CU-LABEL: workgroup_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_acquire_fence: @@ -86,10 +93,12 @@ entry: define amdgpu_kernel void @workgroup_release_fence() { ; GFX6-LABEL: workgroup_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_release_fence: @@ -100,14 +109,17 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX10-CU-LABEL: workgroup_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: @@ -117,6 +129,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_release_fence: @@ -132,6 +145,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX11-CU-LABEL: workgroup_release_fence: ; GFX11-CU: ; %bb.0: ; 
%entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_release_fence: @@ -153,10 +167,12 @@ entry: define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX6-LABEL: workgroup_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acq_rel_fence: @@ -168,14 +184,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -186,6 +205,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -203,6 +223,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: @@ -225,10 +246,12 @@ entry: define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX6-LABEL: workgroup_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: 
s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_seq_cst_fence: @@ -240,14 +263,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -258,6 +284,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -275,6 +302,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: @@ -297,10 +325,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX6-LABEL: workgroup_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -312,14 +342,17 @@ 
define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -330,6 +363,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -347,6 +381,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -367,10 +402,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: @@ -381,14 +418,17 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -398,6 +438,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -413,6 +454,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -434,10 +476,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -449,14 +493,17 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -467,6 +514,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -484,6 +532,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -506,10 +555,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -521,14 +572,17 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -539,6 +593,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -556,6 +611,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 971015b391ca8..c81b47a32067d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -46,6 +46,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acquire_fence: @@ -55,6 +56,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX942-TGSPLIT-LABEL: workgroup_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_acquire_fence: @@ -114,6 +116,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_release_fence: @@ -123,6 
+126,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX942-TGSPLIT-LABEL: workgroup_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_release_fence: @@ -180,6 +184,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -189,6 +194,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX942-TGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_acq_rel_fence: @@ -246,6 +252,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -255,6 +262,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX942-TGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_seq_cst_fence: @@ -282,46 +290,57 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX6-LABEL: workgroup_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence: ; GFX10-WGP: ; 
%bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_one_as_acquire_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -339,46 +358,57 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_one_as_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -396,46 +426,57 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -453,46 +494,57 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: 
workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: 
workgroup_one_as_seq_cst_fence: @@ -540,6 +592,7 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; GFX90A-TGSPLIT-LABEL: agent_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_acquire_fence: @@ -549,6 +602,7 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; GFX942-TGSPLIT-LABEL: agent_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_acquire_fence: @@ -608,6 +662,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX90A-TGSPLIT-LABEL: agent_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_release_fence: @@ -617,6 +672,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX942-TGSPLIT-LABEL: agent_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_release_fence: @@ -674,6 +730,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_acq_rel_fence: @@ -683,6 +740,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX942-TGSPLIT-LABEL: agent_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_acq_rel_fence: @@ -740,6 +798,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_seq_cst_fence: @@ -749,6 +808,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX942-TGSPLIT-LABEL: agent_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_seq_cst_fence: @@ -776,46 +836,57 @@ entry: define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX6-LABEL: agent_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: agent_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_one_as_acquire_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: agent_one_as_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: agent_one_as_acquire_fence: @@ -833,46 +904,57 @@ entry: define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX6-LABEL: agent_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: agent_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: agent_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: 
agent_one_as_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_one_as_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: agent_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: agent_one_as_release_fence: @@ -890,46 +972,57 @@ entry: define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX6-LABEL: agent_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence: @@ -947,46 +1040,57 @@ entry: define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX6-LABEL: agent_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence: @@ -1034,6 +1138,7 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; GFX90A-TGSPLIT-LABEL: system_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_acquire_fence: @@ -1043,6 +1148,7 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; GFX942-TGSPLIT-LABEL: system_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_acquire_fence: @@ -1102,6 +1208,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX90A-TGSPLIT-LABEL: system_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_release_fence: @@ -1111,6 +1218,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX942-TGSPLIT-LABEL: system_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_release_fence: @@ -1168,6 +1276,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX90A-TGSPLIT-LABEL: 
system_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_acq_rel_fence: @@ -1177,6 +1286,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX942-TGSPLIT-LABEL: system_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_acq_rel_fence: @@ -1234,6 +1344,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_seq_cst_fence: @@ -1243,6 +1354,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX942-TGSPLIT-LABEL: system_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_seq_cst_fence: @@ -1270,46 +1382,57 @@ entry: define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX6-LABEL: system_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: system_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: system_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_one_as_acquire_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: system_one_as_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: system_one_as_acquire_fence: @@ -1327,46 +1450,57 @@ entry: define amdgpu_kernel void @system_one_as_release_fence() { ; GFX6-LABEL: system_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: system_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: system_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; 
%entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_one_as_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: system_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: system_one_as_release_fence: @@ -1384,46 +1518,57 @@ entry: define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX6-LABEL: system_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: system_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: system_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence: @@ -1441,46 +1586,57 @@ entry: define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX6-LABEL: system_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: system_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: system_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 0e459ed0f1243..c7ae24e5a56a0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -16,46 +16,57 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_acquire_fence: ; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: singlethread_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: singlethread_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: singlethread_acquire_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: singlethread_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: singlethread_acquire_fence: @@ -73,46 +84,57 @@ entry: define amdgpu_kernel void @singlethread_release_fence() { ; GFX6-LABEL: singlethread_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; 
GFX7-LABEL: singlethread_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: singlethread_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: singlethread_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: singlethread_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: singlethread_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: singlethread_release_fence: @@ -130,46 +152,57 @@ entry: define amdgpu_kernel void @singlethread_acq_rel_fence() { ; GFX6-LABEL: singlethread_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: singlethread_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: singlethread_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: singlethread_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: singlethread_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: singlethread_acq_rel_fence: @@ -187,46 +220,57 @@ entry: define amdgpu_kernel void @singlethread_seq_cst_fence() { ; GFX6-LABEL: 
singlethread_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: singlethread_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: singlethread_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: singlethread_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: singlethread_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: singlethread_seq_cst_fence: @@ -244,46 +288,57 @@ entry: define 
amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; GFX6-LABEL: singlethread_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: singlethread_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: singlethread_one_as_acquire_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: singlethread_one_as_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: singlethread_one_as_acquire_fence: @@ -301,46 +356,57 @@ entry: define amdgpu_kernel void @singlethread_one_as_release_fence() { ; GFX6-LABEL: singlethread_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: singlethread_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: singlethread_one_as_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: singlethread_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: singlethread_one_as_release_fence: @@ -358,46 +424,57 @@ entry: define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; GFX6-LABEL: singlethread_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: singlethread_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: singlethread_one_as_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: singlethread_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: singlethread_one_as_acq_rel_fence: @@ -415,46 +492,57 @@ entry: define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; GFX6-LABEL: singlethread_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: singlethread_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: singlethread_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: singlethread_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: singlethread_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX942-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: singlethread_one_as_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: singlethread_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: singlethread_one_as_seq_cst_fence: @@ -472,46 +560,57 @@ entry: define amdgpu_kernel void @wavefront_acquire_fence() { ; GFX6-LABEL: wavefront_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: wavefront_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: wavefront_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: wavefront_acquire_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: wavefront_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: wavefront_acquire_fence: @@ -529,46 +628,57 @@ entry: define amdgpu_kernel void @wavefront_release_fence() { ; GFX6-LABEL: wavefront_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: wavefront_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: wavefront_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: wavefront_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: wavefront_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: wavefront_release_fence: @@ -586,46 +696,57 @@ entry: define amdgpu_kernel void @wavefront_acq_rel_fence() { ; GFX6-LABEL: wavefront_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: wavefront_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: wavefront_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: wavefront_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: wavefront_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: wavefront_acq_rel_fence: @@ -643,46 +764,57 @@ entry: define amdgpu_kernel void @wavefront_seq_cst_fence() { ; GFX6-LABEL: wavefront_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: wavefront_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: wavefront_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: wavefront_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: wavefront_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: wavefront_seq_cst_fence: @@ -700,46 +832,57 @@ entry: define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; GFX6-LABEL: wavefront_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_acquire_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: wavefront_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; 
; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: wavefront_one_as_acquire_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: wavefront_one_as_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: wavefront_one_as_acquire_fence: @@ -757,46 +900,57 @@ entry: define amdgpu_kernel void @wavefront_one_as_release_fence() { ; GFX6-LABEL: wavefront_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: wavefront_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: wavefront_one_as_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: wavefront_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: wavefront_one_as_release_fence: @@ -814,46 +968,57 @@ entry: define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; GFX6-LABEL: wavefront_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: wavefront_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: wavefront_one_as_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: wavefront_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: wavefront_one_as_acq_rel_fence: @@ -871,46 +1036,57 @@ entry: define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; GFX6-LABEL: wavefront_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: wavefront_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: wavefront_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: wavefront_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: wavefront_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
wavefront_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: wavefront_one_as_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: wavefront_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: wavefront_one_as_seq_cst_fence: @@ -1241,10 +1417,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX6-LABEL: workgroup_one_as_acquire_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acquire_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -1256,14 +1434,17 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -1274,6 +1455,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: @@ -1291,6 +1473,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acquire_fence: @@ -1311,10 +1494,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: @@ -1325,14 +1510,17 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -1342,6 +1530,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -1357,6 +1546,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -1378,10 +1568,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -1393,14 +1585,17 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -1411,6 +1606,7 @@ define amdgpu_kernel void 
@workgroup_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -1428,6 +1624,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -1450,10 +1647,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -1465,14 +1664,17 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -1483,6 +1685,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -1500,6 +1703,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 07ad8cb0c4a3d..5389c052a7fe9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -13656,6 +13656,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13674,6 +13675,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -13752,6 +13754,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13766,6 +13769,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14008,6 +14012,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14028,6 +14033,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14115,6 +14121,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14131,6 +14138,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14208,6 +14216,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14228,6 +14237,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14315,6 +14325,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14331,6 +14342,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15384,6 +15396,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15416,6 +15429,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15528,6 +15542,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15546,6 +15561,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15914,6 +15930,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15948,6 +15965,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16069,6 +16087,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16089,6 +16108,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16203,6 +16223,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16237,6 +16258,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16358,6 +16380,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16378,6 +16401,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], 
v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16489,6 +16513,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16521,6 +16546,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16633,6 +16659,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16651,6 +16678,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16754,6 +16782,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16786,6 +16815,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16898,6 +16928,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16916,6 +16947,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17022,6 +17054,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17056,6 +17089,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17177,6 +17211,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17197,6 +17232,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17311,6 +17347,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17345,6 +17382,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17466,6 +17504,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17486,6 +17525,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17600,6 +17640,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17634,6 +17675,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17755,6 +17797,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17775,6 +17818,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17889,6 +17933,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17923,6 +17968,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18044,6 +18090,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18064,6 +18111,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18178,6 +18226,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18212,6 +18261,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18333,6 +18383,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18353,6 +18404,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18467,6 +18519,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18501,6 +18554,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18622,6 +18676,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18642,6 +18697,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18756,6 +18812,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18790,6 +18847,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18911,6 +18969,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18931,6 +18990,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -19045,6 +19105,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -19079,6 +19140,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -19200,6 +19262,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -19220,6 +19283,7 @@ define amdgpu_kernel void 
@flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b88a10ab24a98..78c7fe4484a4f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -388,6 +388,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -406,6 +407,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -424,6 +426,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -438,6 +441,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: 
flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -453,6 +457,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -467,6 +472,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -479,6 +485,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -491,6 +498,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -504,6 +512,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -518,6 +527,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -569,7 +579,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -587,7 +599,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -605,7 +619,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -619,7 +635,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -634,7 +652,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -648,7 +668,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -660,7 +682,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -672,7 +696,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -685,7 +711,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -699,7 +727,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1050,6 +1080,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1065,6 +1096,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1080,6 +1112,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1091,6 +1124,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1103,6 +1137,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1115,6 +1150,7 @@ define 
amdgpu_kernel void @flat_singlethread_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1125,6 +1161,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1135,6 +1172,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1146,6 +1184,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1157,6 +1196,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1199,6 +1239,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1214,6 +1255,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1229,6 +1271,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1240,6 +1283,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1252,6 +1296,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1264,6 +1309,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1274,6 +1320,7 @@ define amdgpu_kernel void 
@flat_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1284,6 +1331,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1295,6 +1343,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1306,6 +1355,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1498,6 +1548,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1513,6 +1564,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1528,6 +1580,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1539,6 +1592,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1551,6 +1605,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1563,6 +1618,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1573,6 +1629,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1583,6 +1640,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1594,6 +1652,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1605,6 +1664,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1646,6 +1706,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1661,6 +1722,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1676,6 +1738,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1687,6 +1750,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1699,6 +1763,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1711,6 +1776,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1721,6 +1787,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1731,6 +1798,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1742,6 +1810,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1753,6 +1822,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1795,7 +1865,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1810,7 +1882,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1825,7 +1899,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1836,7 +1912,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1848,7 +1926,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1860,7 +1940,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1870,7 +1952,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1880,7 +1964,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1891,7 +1977,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1902,7 +1990,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1944,7 +2034,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1959,7 +2051,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1974,7 +2068,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1985,7 +2081,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1997,7 +2095,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_seq_cst_atomicrmw: @@ -2009,7 +2109,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2019,7 +2121,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2029,7 +2133,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2040,7 +2146,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2051,7 +2159,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -2094,6 +2204,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2113,6 +2224,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2132,6 +2244,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2147,6 +2260,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2163,6 +2277,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2178,6 +2293,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2191,6 +2307,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2204,6 +2321,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2218,6 +2336,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2233,6 +2352,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2287,7 +2407,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2306,7 +2428,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2325,7 +2449,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2340,7 +2466,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2356,7 +2484,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2371,7 +2501,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2384,7 +2516,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2397,7 +2531,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2411,7 +2547,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2426,7 +2564,9 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2481,7 +2621,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2500,7 +2642,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2519,7 +2663,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2534,7 +2680,9 @@ define amdgpu_kernel void 
@flat_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2550,7 +2698,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2565,7 +2715,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2578,7 +2730,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2591,7 +2745,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2605,7 +2761,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2620,7 +2778,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2928,6 +3088,7 @@ define amdgpu_kernel void 
@flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -2957,6 +3118,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -2986,6 +3148,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3011,6 +3174,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3027,6 +3191,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3043,6 +3208,7 @@ define amdgpu_kernel void 
@flat_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3057,6 +3223,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3071,6 +3238,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3086,6 +3254,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3101,6 +3270,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; 
GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -3165,6 +3335,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3194,6 +3365,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; @@ -3223,6 +3395,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -3248,6 +3421,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3264,6 +3438,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3280,6 +3455,7 @@ define amdgpu_kernel 
void @flat_singlethread_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3294,6 +3470,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3308,6 +3485,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3323,6 +3501,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -3338,6 +3517,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3403,7 +3583,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3432,7 +3614,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3461,7 +3645,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3486,7 +3672,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: 
@@ -3502,7 +3690,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3518,7 +3708,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3532,7 +3724,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3546,7 +3740,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3561,7 +3757,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3576,7 +3774,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -3641,7 +3841,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3670,7 +3872,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3699,7 +3903,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3724,7 +3930,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3740,7 +3948,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3756,7 +3966,9 @@ define amdgpu_kernel void 
@flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3770,7 +3982,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3784,7 +3998,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3799,7 +4015,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) 
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3814,7 +4032,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -3880,6 +4100,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3909,6 +4130,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3938,6 +4160,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3963,6 +4186,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3979,6 +4203,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -3995,6 +4220,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -4009,6 +4235,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -4023,6 +4250,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
flat_singlethread_monotonic_acquire_cmpxchg: @@ -4038,6 +4266,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -4053,6 +4282,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -4118,6 +4348,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4147,6 +4378,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4176,6 +4408,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4201,6 +4434,7 @@ define amdgpu_kernel void 
@flat_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4217,6 +4451,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4233,6 +4468,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4247,6 +4483,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4261,6 +4498,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4276,6 +4514,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4291,6 +4530,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -4355,7 +4595,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4384,7 +4626,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4413,7 +4657,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4438,7 +4684,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4454,7 +4702,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4470,7 +4720,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
flat_singlethread_release_acquire_cmpxchg: @@ -4484,7 +4736,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4498,7 +4752,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4513,7 +4769,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4528,7 +4786,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -4593,7 +4853,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4622,7 +4884,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4651,7 +4915,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4676,7 +4942,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) 
; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4692,7 +4960,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4708,7 +4978,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4722,7 +4994,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4736,7 +5010,9 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4751,7 +5027,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4766,7 +5044,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -4831,7 +5111,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4860,7 +5142,9 @@ define amdgpu_kernel void 
@flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4889,7 +5173,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4914,7 +5200,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4930,7 +5218,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4946,7 +5236,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4960,7 +5252,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4974,7 +5268,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -4989,7 +5285,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -5004,7 +5302,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -5069,7 +5369,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5098,7 +5400,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5127,7 +5431,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5152,7 +5458,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5168,7 +5476,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5184,7 +5494,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5198,7 +5510,9 @@ define amdgpu_kernel void 
@flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5212,7 +5526,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5227,7 +5543,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5242,7 +5560,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -5307,7 +5627,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5336,7 +5658,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5365,7 +5689,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5390,7 +5716,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5406,7 +5734,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5422,7 +5752,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5436,7 +5768,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5450,7 +5784,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5465,7 +5801,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5480,7 +5818,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -5545,7 +5885,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5574,7 +5916,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5603,7 +5947,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5628,7 +5974,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5644,7 +5992,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5660,7 +6010,9 @@ define amdgpu_kernel void 
@flat_singlethread_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5674,7 +6026,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5688,7 +6042,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5703,7 +6059,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5718,7 +6076,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -5783,7 +6143,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5812,7 +6174,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5841,7 +6205,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5866,7 +6232,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5882,7 +6250,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5898,7 +6268,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5912,7 +6284,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5926,7 +6300,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5941,7 +6317,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5956,7 +6334,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -6021,7 +6401,9 @@ define 
amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6050,7 +6432,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6079,7 +6463,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6104,7 +6490,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6120,7 +6508,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6136,7 +6526,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6150,7 +6542,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6164,7 +6558,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6179,7 +6575,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6194,7 +6592,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6544,6 +6944,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6577,6 +6978,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6610,6 +7012,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6639,6 +7042,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6659,6 +7063,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6678,6 +7083,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ 
-6695,6 +7101,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6712,6 +7119,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6730,6 +7138,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6749,6 +7158,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6827,6 +7237,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; 
GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6860,6 +7271,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -6893,6 +7305,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -6922,6 +7335,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -6942,6 +7356,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6961,6 +7376,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -6978,6 +7394,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6995,6 +7412,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -7013,6 +7431,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -7032,6 +7451,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -7111,7 +7531,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7144,7 +7566,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7177,7 +7601,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7206,7 +7632,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7226,7 +7654,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7245,7 +7675,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7262,7 +7694,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7279,7 +7713,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7297,7 +7733,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7316,7 +7754,9 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7395,7 +7835,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7428,7 +7870,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7461,7 +7905,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
@@ -7490,7 +7936,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7510,7 +7958,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7529,7 +7979,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7546,7 +7998,9 @@ define amdgpu_kernel 
void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7563,7 +8017,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7581,7 +8037,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7600,7 +8058,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7680,6 +8140,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7713,6 +8174,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7746,6 +8208,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7775,6 +8238,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7795,6 +8259,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7814,6 +8279,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7831,6 +8297,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7848,6 +8315,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7866,6 +8334,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7885,6 +8354,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7964,6 +8434,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7997,6 +8468,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8030,6 +8502,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8059,6 +8532,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8079,6 +8553,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8098,6 +8573,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8115,6 +8591,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8132,6 +8609,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8150,6 +8628,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8169,6 +8648,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8247,7 +8727,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8280,7 +8762,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8313,7 +8797,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8342,7 +8828,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8362,7 +8850,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8381,7 +8871,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8398,7 +8890,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8415,7 +8909,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8433,7 +8929,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8452,7 +8950,9 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8531,7 +9031,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8564,7 +9066,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8597,7 +9101,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8626,7 +9132,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8646,7 +9154,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8665,7 +9175,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8682,7 +9194,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8699,7 +9213,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8717,7 +9233,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8736,7 +9254,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8815,7 +9335,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8848,7 +9370,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8881,7 +9405,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8910,7 +9436,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8930,7 +9458,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8949,7 +9479,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8966,7 +9498,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8983,7 +9517,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9001,7 +9537,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9020,7 +9558,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9099,7 +9639,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9132,7 +9674,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9165,7 +9709,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9194,7 +9740,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9214,7 +9762,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9233,7 +9783,9 @@ define amdgpu_kernel void 
@flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9250,7 +9802,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9267,7 +9821,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9285,7 +9841,9 @@ define amdgpu_kernel void 
@flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9304,7 +9862,9 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9383,7 +9943,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9416,7 +9978,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -9449,7 +10013,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9478,7 +10044,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9498,7 +10066,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9517,7 +10087,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9534,7 +10106,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9551,7 +10125,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9569,7 +10145,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9588,7 +10166,9 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9667,7 +10247,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9700,7 +10282,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9733,7 +10317,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9762,7 +10348,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9782,7 +10370,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9801,7 +10391,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9818,7 +10410,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9835,7 +10429,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9853,7 +10449,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9872,7 +10470,9 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9951,7 +10551,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9984,7 +10586,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10017,7 +10621,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10046,7 +10652,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10066,7 +10674,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10085,7 +10695,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10102,7 +10714,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10119,7 +10733,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10137,7 +10753,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10156,7 +10774,9 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10235,7 +10855,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10268,7 +10890,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10301,7 +10925,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10330,7 +10956,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10350,7 +10978,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10369,7 +10999,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10386,7 +11018,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10403,7 +11037,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10421,7 +11057,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10440,7 +11078,9 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; 
GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10869,6 +11509,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10887,6 +11528,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10905,6 +11547,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10919,6 +11562,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10934,6 +11578,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10948,6 +11593,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10960,6 +11606,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10972,6 +11619,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10985,6 +11633,7 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10999,6 +11648,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11050,7 +11700,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11068,7 +11720,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11086,7 +11740,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11100,7 +11756,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11115,7 +11773,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11129,7 +11789,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -11141,7 +11803,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11153,7 +11817,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11166,7 +11832,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11180,7 +11848,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11531,6 +12201,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11546,6 +12217,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11561,6 +12233,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11572,6 +12245,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11584,6 +12258,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; 
@@ -11596,6 +12271,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11606,6 +12282,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11616,6 +12293,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11627,6 +12305,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -11638,6 +12317,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11680,6 +12360,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11695,6 +12376,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11710,6 +12392,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11721,6 +12404,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +12417,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11745,6 +12430,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11755,6 +12441,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11765,6 +12452,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11776,6 +12464,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -11787,6 +12476,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11979,6 +12669,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -11994,6 +12685,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 
s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12009,6 +12701,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12020,6 +12713,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12032,6 +12726,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12044,6 +12739,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12054,6 +12750,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12064,6 +12761,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12075,6 +12773,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12086,6 +12785,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -12127,6 +12827,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12142,6 +12843,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -12157,6 +12859,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -12168,6 +12871,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12180,6 +12884,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12192,6 +12897,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -12202,6 +12908,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12212,6 +12919,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -12223,6 +12931,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -12234,6 +12943,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12276,7 +12986,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12291,7 +13003,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12306,7 +13020,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12317,7 +13033,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12329,7 +13047,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12341,7 +13061,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12351,7 +13073,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12361,7 +13085,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12372,7 +13098,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12383,7 +13111,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -12425,7 +13155,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12440,7 +13172,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12455,7 +13189,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12466,7 +13202,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12478,7 +13216,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12490,7 +13230,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12500,7 +13242,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12510,7 +13254,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12521,7 +13267,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12532,7 +13280,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -12575,6 +13325,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12594,6 +13345,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12613,6 +13365,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12628,6 +13381,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12644,6 +13398,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12659,6 +13414,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12672,6 +13428,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12685,6 +13442,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12699,6 +13457,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12714,6 +13473,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12768,7 +13528,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12787,7 +13549,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12806,7 +13570,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12821,7 +13587,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12837,7 +13605,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12852,7 +13622,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12865,7 +13637,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12878,7 +13652,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12892,7 +13668,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12907,7 +13685,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12962,7 +13742,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12981,7 +13763,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13000,7 +13784,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13015,7 +13801,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13031,7 +13819,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13046,7 +13836,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13059,7 +13851,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13072,7 +13866,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13086,7 +13882,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13101,7 +13899,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13409,6 +14209,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13438,6 +14239,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13467,6 +14269,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13492,6 +14295,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13508,6 +14312,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13524,6 +14329,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13538,6 +14344,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13552,6 +14359,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13567,6 +14375,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13582,6 +14391,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -13646,6 +14456,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13675,6 +14486,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; @@ -13704,6 +14516,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( 
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -13729,6 +14542,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13745,6 +14559,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13761,6 +14576,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -13775,6 +14591,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13789,6 +14606,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13804,6 +14622,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13819,6 +14638,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13884,7 +14704,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13913,7 +14735,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13942,7 +14766,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13967,7 +14793,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13983,7 +14811,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13999,7 +14829,9 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14013,7 +14845,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14027,7 +14861,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14042,7 +14878,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14057,7 +14895,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -14122,7 +14962,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14151,7 +14993,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14180,7 +15024,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14205,7 +15051,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14221,7 +15069,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14237,7 +15087,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14251,7 +15103,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14265,7 +15119,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14280,7 +15136,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14295,7 +15153,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -14361,6 +15221,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14390,6 +15251,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14419,6 +15281,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14444,6 +15307,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14460,6 +15324,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14476,6 +15341,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14490,6 +15356,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14504,6 +15371,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14519,6 +15387,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14534,6 +15403,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -14599,6 +15469,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14628,6 +15499,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14657,6 +15529,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14682,6 +15555,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14698,6 +15572,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14714,6 +15589,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14728,6 +15604,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14742,6 +15619,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14757,6 +15635,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14772,6 +15651,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -14836,7 +15716,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14865,7 +15747,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14894,7 +15778,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14919,7 +15805,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14935,7 +15823,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14951,7 +15841,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14965,7 +15857,9 
@@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14979,7 +15873,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -14994,7 +15890,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -15009,7 +15907,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -15074,7 +15974,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15103,7 +16005,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15132,7 +16036,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15157,7 +16063,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15173,7 +16081,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15189,7 +16099,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15203,7 +16115,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: 
flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15217,7 +16131,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15232,7 +16148,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15247,7 +16165,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -15312,7 +16232,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15341,7 +16263,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15370,7 +16294,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15395,7 +16321,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15411,7 +16339,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) 
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15427,7 +16357,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15441,7 +16373,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15455,7 +16389,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; 
GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15470,7 +16406,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15485,7 +16423,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -15550,7 +16490,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15579,7 +16521,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15608,7 +16552,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15633,7 +16579,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15649,7 +16597,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15665,7 +16615,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15679,7 +16631,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15693,7 +16647,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15708,7 +16664,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15723,7 +16681,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -15788,7 +16748,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15817,7 +16779,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15846,7 +16810,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15871,7 +16837,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15887,7 +16855,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15903,7 +16873,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15917,7 +16889,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15931,7 +16905,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15946,7 +16922,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15961,7 +16939,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: 
flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -16026,7 +17006,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16055,7 +17037,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16084,7 +17068,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16109,7 +17095,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16125,7 +17113,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16141,7 +17131,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16155,7 +17147,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16169,7 +17163,9 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16184,7 +17180,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16199,7 +17197,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -16264,7 +17264,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ 
-16293,7 +17295,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16322,7 +17326,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16347,7 +17353,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16363,7 +17371,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16379,7 +17389,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16393,7 +17405,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16407,7 +17421,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16422,7 +17438,9 @@ define amdgpu_kernel 
void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16437,7 +17455,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -16502,7 +17522,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16531,7 +17553,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16560,7 +17584,9 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16585,7 +17611,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16601,7 +17629,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16617,7 +17647,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16631,7 +17663,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16645,7 +17679,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16660,7 +17696,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -16675,7 +17713,9 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -17025,6 +18065,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17058,6 +18099,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17091,6 +18133,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17120,6 +18163,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17140,6 +18184,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17159,6 +18204,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17176,6 +18222,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17193,6 +18240,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; 
GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17211,6 +18259,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17230,6 +18279,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17308,6 +18358,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17341,6 +18392,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -17374,6 +18426,7 
@@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -17403,6 +18456,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -17423,6 +18477,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17442,6 +18497,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) @@ -17459,6 +18515,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17476,6 +18533,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -17494,6 +18552,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -17513,6 +18572,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -17592,7 +18652,9 @@ define amdgpu_kernel 
void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17625,7 +18687,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17658,7 +18722,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17687,7 +18753,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17707,7 +18775,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17726,7 +18796,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17743,7 +18815,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17760,7 +18834,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17778,7 +18854,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17797,7 +18875,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17876,7 +18956,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; 
GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17909,7 +18991,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17942,7 +19026,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17971,7 +19057,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -17991,7 +19079,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18010,7 +19100,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18027,7 +19119,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18044,7 +19138,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18062,7 +19158,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18081,7 +19179,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18161,6 +19261,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, 
s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18194,6 +19295,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18227,6 +19329,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18256,6 +19359,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18276,6 +19380,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18295,6 +19400,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18312,6 +19418,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18329,6 +19436,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18347,6 +19455,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 
s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18366,6 +19475,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18445,6 +19555,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18478,6 +19589,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18511,6 +19623,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18540,6 +19653,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18560,6 +19674,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18579,6 +19694,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18596,6 +19712,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18613,6 +19730,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 
v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18631,6 +19749,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18650,6 +19769,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18728,7 +19848,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18761,7 +19883,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18794,7 +19918,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18823,7 +19949,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18843,7 +19971,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18862,7 +19992,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18879,7 +20011,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18896,7 +20030,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18914,7 +20050,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18933,7 +20071,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19012,7 +20152,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19045,7 +20187,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19078,7 +20222,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19107,7 +20253,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19127,7 +20275,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19146,7 
+20296,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19163,7 +20315,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19180,7 +20334,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19198,7 
+20354,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19217,7 +20375,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19296,7 +20456,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19329,7 +20491,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19362,7 +20526,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19391,7 +20557,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19411,7 +20579,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19430,7 +20600,9 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19447,7 +20619,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19464,7 +20638,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19482,7 +20658,9 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19501,7 +20679,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19580,7 +20760,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19613,7 +20795,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19646,7 +20830,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19675,7 +20861,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19695,7 +20883,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19714,7 +20904,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19731,7 +20923,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19748,7 +20942,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19766,7 +20962,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 
v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19785,7 +20983,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19864,7 +21064,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19897,7 +21099,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19930,7 +21134,9 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19959,7 +21165,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19979,7 +21187,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19998,7 +21208,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20015,7 +21227,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20032,7 +21246,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20050,7 +21266,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 
v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20069,7 +21287,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20148,7 +21368,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20181,7 +21403,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20214,7 +21438,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; 
GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20243,7 +21469,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20263,7 +21491,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20282,7 +21512,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], 
s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20299,7 +21531,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20316,7 +21550,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20334,7 +21570,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20353,7 +21591,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20432,7 +21672,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20465,7 +21707,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20498,7 +21742,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20527,7 +21773,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20547,7 +21795,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20566,7 +21816,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20583,7 +21835,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20600,7 +21854,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20618,7 +21874,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20637,7 +21895,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20716,7 +21976,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20749,7 +22011,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20782,7 +22046,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20811,7 +22077,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20831,7 +22099,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20850,7 +22120,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20867,7 +22139,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20884,7 +22158,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20902,7 +22178,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20921,7 +22199,9 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 919fc3e8f4e4f..d61672ac06d99 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -13852,6 +13852,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13870,6 +13871,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -13950,6 +13952,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13964,6 +13967,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14210,6 +14214,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14230,6 +14235,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14321,6 +14327,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14337,6 +14344,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: 
buffer_gl0_inv @@ -14416,6 +14424,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14436,6 +14445,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14527,6 +14537,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14543,6 +14554,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15612,6 +15624,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15644,6 +15657,7 @@ define 
amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15758,6 +15772,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15776,6 +15791,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16148,6 +16164,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16182,6 +16199,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16307,6 +16325,7 
@@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16327,6 +16346,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16443,6 +16463,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16477,6 +16498,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16602,6 +16624,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; 
GFX11-WGP-NEXT: buffer_gl0_inv @@ -16622,6 +16645,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16735,6 +16759,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16767,6 +16792,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16881,6 +16907,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16899,6 +16926,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17002,6 +17030,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17034,6 +17063,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17148,6 +17178,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17166,6 +17197,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17272,6 +17304,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17306,6 +17339,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17431,6 +17465,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17451,6 +17486,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17567,6 +17603,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17601,6 +17638,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17726,6 +17764,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17746,6 +17785,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17862,6 +17902,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17896,6 +17937,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18021,6 +18063,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18041,6 +18084,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18157,6 +18201,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18191,6 +18236,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18316,6 +18362,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18336,6 +18383,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18452,6 +18500,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18486,6 +18535,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18611,6 +18661,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18631,6 +18682,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18747,6 +18799,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18781,6 +18834,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18906,6 +18960,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18926,6 +18981,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -19042,6 +19098,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -19076,6 +19133,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -19201,6 +19259,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -19221,6 +19280,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -19337,6 +19397,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -19371,6 +19432,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -19496,6 +19558,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -19516,6 +19579,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index a88e0e217fdb4..d1d8d656378d3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -459,6 +459,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -478,6 +479,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; @@ -508,6 +510,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -523,6 +526,7 @@ define amdgpu_kernel void 
@flat_nontemporal_store_0( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; @@ -632,6 +636,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -664,6 +669,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; @@ -723,6 +729,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -753,6 +760,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 7c637a20ab47b..aba437208e920 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -388,6 +388,7 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -406,6 +407,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -424,6 +426,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -438,6 +441,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -453,6 +457,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ 
-467,6 +472,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -479,6 +485,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -491,6 +498,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -504,6 +512,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -518,6 +527,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -569,7 +579,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -587,7 +599,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -605,7 +619,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -619,7 +635,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -634,7 +652,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -648,7 +668,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -660,7 +682,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -672,7 +696,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX942-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -685,7 +711,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -699,7 +727,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1050,6 +1080,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1065,6 +1096,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: 
flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1080,6 +1112,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1091,6 +1124,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1103,6 +1137,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1115,6 +1150,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1125,6 +1161,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1135,6 +1172,7 @@ define amdgpu_kernel void 
@flat_wavefront_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1146,6 +1184,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1157,6 +1196,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1199,6 +1239,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1214,6 +1255,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1229,6 +1271,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1240,6 +1283,7 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1252,6 +1296,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1264,6 +1309,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1274,6 +1320,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1284,6 +1331,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1295,6 +1343,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1306,6 +1355,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1498,6 +1548,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1513,6 +1564,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1528,6 +1580,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1539,6 +1592,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1551,6 +1605,7 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1563,6 +1618,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1573,6 +1629,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1583,6 +1640,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1594,6 +1652,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1605,6 +1664,7 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1646,6 +1706,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1661,6 +1722,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1676,6 +1738,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1687,6 +1750,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1699,6 +1763,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1711,6 +1776,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1721,6 +1787,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1731,6 +1798,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1742,6 +1810,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1753,6 +1822,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1795,7 +1865,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: 
v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1810,7 +1882,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1825,7 +1899,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1836,7 +1912,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1848,7 +1926,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1860,7 +1940,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1870,7 +1952,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1880,7 +1964,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1891,7 +1977,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1902,7 +1990,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1944,7 +2034,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -1959,7 +2051,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -1974,7 +2068,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
flat_wavefront_seq_cst_atomicrmw: @@ -1985,7 +2081,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -1997,7 +2095,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2009,7 +2109,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2019,7 +2121,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2029,7 +2133,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2040,7 +2146,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2051,7 +2159,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -2094,6 +2204,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2113,6 +2224,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2132,6 +2244,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2147,6 +2260,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2163,6 +2277,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2178,6 +2293,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2191,6 +2307,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2204,6 +2321,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2218,6 +2336,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2233,6 +2352,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2287,7 +2407,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2306,7 +2428,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2325,7 +2449,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2340,7 +2466,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2356,7 +2484,9 @@ define amdgpu_kernel void 
@flat_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2371,7 +2501,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2384,7 +2516,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2397,7 +2531,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2411,7 +2547,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2426,7 +2564,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2481,7 +2621,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2500,7 +2642,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2519,7 +2663,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2534,7 +2680,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2550,7 +2698,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2565,7 +2715,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2578,7 +2730,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2591,7 +2745,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2605,7 +2761,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2620,7 +2778,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2928,6 +3088,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -2957,6 +3118,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -2986,6 +3148,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3011,6 +3174,7 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3027,6 +3191,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3043,6 +3208,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3057,6 +3223,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3071,6 +3238,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3086,6 +3254,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3101,6 +3270,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -3165,6 +3335,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3194,6 +3365,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; @@ -3223,6 +3395,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -3248,6 +3421,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3264,6 +3438,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3280,6 +3455,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3294,6 +3470,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3308,6 +3485,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3323,6 +3501,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -3338,6 +3517,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3403,7 +3583,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3432,7 +3614,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3461,7 +3645,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3486,7 +3672,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3502,7 +3690,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3518,7 +3708,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3532,7 +3724,9 @@ 
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3546,7 +3740,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3561,7 +3757,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3576,7 +3774,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -3641,7 +3841,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3670,7 +3872,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3699,7 +3903,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3724,7 +3930,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3740,7 +3948,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3756,7 +3966,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3770,7 +3982,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3784,7 +3998,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3799,7 +4015,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3814,7 +4032,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -3880,6 +4100,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3909,6 +4130,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3938,6 +4160,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3963,6 +4186,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3979,6 +4203,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -3995,6 +4220,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4009,6 +4235,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; 
GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4023,6 +4250,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4038,6 +4266,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4053,6 +4282,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -4118,6 +4348,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4147,6 +4378,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4176,6 +4408,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4201,6 +4434,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4217,6 +4451,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4233,6 +4468,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4247,6 +4483,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 
; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4261,6 +4498,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4276,6 +4514,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4291,6 +4530,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -4355,7 +4595,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4384,7 +4626,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4413,7 +4657,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4438,7 +4684,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4454,7 +4702,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4470,7 +4720,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( 
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4484,7 +4736,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4498,7 +4752,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4513,7 +4769,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4528,7 +4786,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -4593,7 +4853,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4622,7 +4884,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4651,7 +4915,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ 
-4676,7 +4942,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4692,7 +4960,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4708,7 +4978,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4722,7 +4994,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4736,7 +5010,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4751,7 +5027,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4766,7 +5044,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -4831,7 +5111,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4860,7 +5142,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4889,7 +5173,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4914,7 +5200,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4930,7 +5218,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] 
op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4946,7 +5236,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4960,7 +5252,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4974,7 +5268,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: 
s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -4989,7 +5285,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -5004,7 +5302,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -5069,7 +5369,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5098,7 +5400,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5127,7 +5431,9 @@ define amdgpu_kernel void 
@flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5152,7 +5458,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5168,7 +5476,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5184,7 +5494,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5198,7 +5510,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5212,7 +5526,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5227,7 +5543,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5242,7 +5560,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -5307,7 +5627,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5336,7 +5658,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5365,7 +5689,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5390,7 +5716,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5406,7 +5734,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5422,7 +5752,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5436,7 +5768,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5450,7 +5784,9 @@ 
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5465,7 +5801,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5480,7 +5818,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -5545,7 +5885,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5574,7 +5916,9 @@ define amdgpu_kernel void 
@flat_wavefront_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5603,7 +5947,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5628,7 +5974,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5644,7 +5992,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_release_seq_cst_cmpxchg: @@ -5660,7 +6010,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5674,7 +6026,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5688,7 +6042,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5703,7 +6059,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5718,7 +6076,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -5783,7 +6143,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5812,7 +6174,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5841,7 +6205,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5866,7 +6232,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5882,7 +6250,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5898,7 +6268,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5912,7 +6284,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5926,7 +6300,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5941,7 +6317,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5956,7 +6334,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -6021,7 +6401,9 @@ define 
amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6050,7 +6432,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6079,7 +6463,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6104,7 +6490,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6120,7 +6508,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6136,7 +6526,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6150,7 +6542,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6164,7 +6558,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6179,7 +6575,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6194,7 +6592,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6544,6 +6944,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6577,6 +6978,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-6610,6 +7012,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6639,6 +7042,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6659,6 +7063,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6678,6 +7083,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6695,6 +7101,7 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6712,6 +7119,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6730,6 +7138,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6749,6 +7158,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6827,6 +7237,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 
+; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6860,6 +7271,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -6893,6 +7305,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -6922,6 +7335,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -6942,6 +7356,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6961,6 +7376,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -6978,6 +7394,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6995,6 +7412,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -7013,6 +7431,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -7032,6 +7451,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -7111,7 +7531,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7144,7 +7566,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7177,7 +7601,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -7206,7 +7632,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7226,7 +7654,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7245,7 +7675,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7262,7 +7694,9 @@ define amdgpu_kernel 
void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7279,7 +7713,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7297,7 +7733,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7316,7 +7754,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 
+; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7395,7 +7835,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7428,7 +7870,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7461,7 +7905,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7490,7 +7936,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7510,7 +7958,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7529,7 +7979,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7546,7 +7998,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7563,7 +8017,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7581,7 +8037,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7600,7 +8058,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7680,6 +8140,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7713,6 +8174,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7746,6 +8208,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7775,6 +8238,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7795,6 +8259,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7814,6 +8279,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7831,6 +8297,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7848,6 +8315,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7866,6 +8334,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7885,6 +8354,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7964,6 +8434,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7997,6 +8468,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8030,6 +8502,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8059,6 +8532,7 
@@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8079,6 +8553,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8098,6 +8573,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8115,6 +8591,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 @@ -8132,6 +8609,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8150,6 +8628,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8169,6 +8648,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8247,7 +8727,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8280,7 +8762,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8313,7 +8797,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8342,7 +8828,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8362,7 +8850,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], 
s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8381,7 +8871,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8398,7 +8890,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8415,7 +8909,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8433,7 +8929,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8452,7 +8950,9 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8531,7 +9031,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8564,7 +9066,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8597,7 +9101,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8626,7 +9132,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8646,7 +9154,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8665,7 +9175,9 @@ define amdgpu_kernel void 
@flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8682,7 +9194,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8699,7 +9213,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8717,7 +9233,9 @@ define amdgpu_kernel void 
@flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8736,7 +9254,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8815,7 +9335,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8848,7 +9370,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -8881,7 +9405,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8910,7 +9436,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8930,7 +9458,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8949,7 +9479,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8966,7 +9498,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -8983,7 +9517,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9001,7 +9537,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9020,7 +9558,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9099,7 +9639,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9132,7 +9674,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9165,7 +9709,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9194,7 +9740,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9214,7 +9762,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9233,7 +9783,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9250,7 +9802,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9267,7 +9821,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9285,7 +9841,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9304,7 +9862,9 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9383,7 +9943,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9416,7 +9978,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9449,7 +10013,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9478,7 +10044,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9498,7 +10066,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9517,7 +10087,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] 
op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9534,7 +10106,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9551,7 +10125,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9569,7 +10145,9 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9588,7 +10166,9 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9667,7 +10247,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9700,7 +10282,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9733,7 +10317,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9762,7 
+10348,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9782,7 +10370,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9801,7 +10391,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9818,7 +10410,9 @@ define amdgpu_kernel void 
@flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9835,7 +10429,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -9853,7 +10449,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9872,7 +10470,9 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9951,7 +10551,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9984,7 +10586,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10017,7 +10621,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10046,7 +10652,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10066,7 +10674,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10085,7 +10695,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10102,7 +10714,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 
; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10119,7 +10733,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10137,7 +10753,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10156,7 +10774,9 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10235,7 +10855,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10268,7 +10890,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10301,7 +10925,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10330,7 +10956,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10350,7 +10978,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10369,7 +10999,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10386,7 +11018,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10403,7 +11037,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10421,7 +11057,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10440,7 +11078,9 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10869,6 +11509,7 @@ define amdgpu_kernel 
void @flat_wavefront_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10887,6 +11528,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10905,6 +11547,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10919,6 +11562,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10934,6 +11578,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10948,6 +11593,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10960,6 +11606,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10972,6 +11619,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10985,6 +11633,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10999,6 +11648,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11050,7 +11700,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11068,7 +11720,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11086,7 +11740,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11100,7 +11756,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: 
flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11115,7 +11773,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11129,7 +11789,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11141,7 +11803,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11153,7 +11817,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11166,7 +11832,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11180,7 +11848,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11531,6 +12201,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11546,6 +12217,7 @@ define 
amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11561,6 +12233,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11572,6 +12245,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11584,6 +12258,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11596,6 +12271,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11606,6 +12282,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11616,6 +12293,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11627,6 +12305,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -11638,6 +12317,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11680,6 +12360,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11695,6 +12376,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11710,6 +12392,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11721,6 +12404,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +12417,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11745,6 +12430,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11755,6 +12441,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11765,6 +12452,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11776,6 +12464,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -11787,6 +12476,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11979,6 +12669,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -11994,6 +12685,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12009,6 +12701,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
flat_wavefront_one_as_acquire_atomicrmw: @@ -12020,6 +12713,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12032,6 +12726,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12044,6 +12739,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12054,6 +12750,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12064,6 +12761,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12075,6 +12773,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12086,6 +12785,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -12127,6 +12827,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12142,6 +12843,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -12157,6 +12859,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -12168,6 +12871,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12180,6 +12884,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12192,6 +12897,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -12202,6 +12908,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12212,6 +12919,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -12223,6 +12931,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_endpgm ; @@ -12234,6 +12943,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12276,7 +12986,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12291,7 +13003,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12306,7 +13020,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12317,7 +13033,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12329,7 +13047,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12341,7 +13061,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12351,7 +13073,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12361,7 +13085,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12372,7 +13098,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12383,7 +13111,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -12425,7 +13155,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12440,7 +13172,9 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12455,7 +13189,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12466,7 +13202,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12478,7 +13216,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12490,7 +13230,9 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12500,7 +13242,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12510,7 +13254,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12521,7 +13267,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: 
@@ -12532,7 +13280,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -12575,6 +13325,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12594,6 +13345,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12613,6 +13365,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12628,6 +13381,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12644,6 +13398,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12659,6 +13414,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12672,6 +13428,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12685,6 +13442,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12699,6 +13457,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12714,6 +13473,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12768,7 +13528,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12787,7 +13549,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12806,7 +13570,9 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12821,7 +13587,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12837,7 +13605,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12852,7 +13622,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12865,7 +13637,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12878,7 +13652,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12892,7 +13668,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-12907,7 +13685,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12962,7 +13742,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12981,7 +13763,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13000,7 +13784,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
@@ -13015,7 +13801,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13031,7 +13819,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13046,7 +13836,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13059,7 +13851,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13072,7 +13866,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13086,7 +13882,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13101,7 +13899,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13409,6 +14209,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13438,6 +14239,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13467,6 +14269,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13492,6 +14295,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13508,6 +14312,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13524,6 +14329,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13538,6 +14344,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13552,6 +14359,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13567,6 +14375,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13582,6 +14391,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 
v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -13646,6 +14456,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13675,6 +14486,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; @@ -13704,6 +14516,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -13729,6 +14542,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13745,6 +14559,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13761,6 +14576,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -13775,6 +14591,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13789,6 +14606,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13804,6 +14622,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13819,6 +14638,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13884,7 +14704,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13913,7 +14735,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13942,7 +14766,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13967,7 +14793,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13983,7 +14811,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13999,7 +14829,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14013,7 +14845,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14027,7 +14861,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14042,7 +14878,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14057,7 +14895,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -14122,7 +14962,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14151,7 +14993,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14180,7 +15024,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14205,7 +15051,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14221,7 +15069,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] 
op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14237,7 +15087,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14251,7 +15103,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14265,7 +15119,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14280,7 +15136,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14295,7 +15153,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -14361,6 +15221,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14390,6 +15251,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14419,6 +15281,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14444,6 +15307,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14460,6 +15324,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14476,6 +15341,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14490,6 +15356,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14504,6 +15371,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14519,6 +15387,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14534,6 +15403,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -14599,6 +15469,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14628,6 +15499,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14657,6 +15529,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14682,6 +15555,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14698,6 +15572,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14714,6 +15589,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14728,6 +15604,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14742,6 +15619,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14757,6 +15635,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14772,6 +15651,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -14836,7 +15716,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14865,7 +15747,9 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14894,7 +15778,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14919,7 +15805,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14935,7 +15823,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; 
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14951,7 +15841,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14965,7 +15857,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14979,7 +15873,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -14994,7 +15890,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, 
v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -15009,7 +15907,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -15074,7 +15974,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15103,7 +16005,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15132,7 +16036,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 
s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15157,7 +16063,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15173,7 +16081,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15189,7 +16099,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15203,7 +16115,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15217,7 +16131,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15232,7 +16148,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15247,7 +16165,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -15312,7 +16232,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15341,7 +16263,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15370,7 +16294,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15395,7 +16321,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], 
v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15411,7 +16339,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15427,7 +16357,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15441,7 +16373,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: 
flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15455,7 +16389,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15470,7 +16406,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15485,7 +16423,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -15550,7 +16490,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: 
s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15579,7 +16521,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15608,7 +16552,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15633,7 +16579,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15649,7 +16597,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15665,7 +16615,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15679,7 +16631,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15693,7 +16647,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15708,7 +16664,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15723,7 +16681,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -15788,7 +16748,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15817,7 +16779,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15846,7 
+16810,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15871,7 +16837,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15887,7 +16855,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15903,7 +16873,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15917,7 +16889,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15931,7 +16905,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15946,7 +16922,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15961,7 +16939,9 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -16026,7 +17006,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16055,7 +17037,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16084,7 +17068,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16109,7 +17095,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, 
v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16125,7 +17113,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16141,7 +17131,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16155,7 +17147,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16169,7 +17163,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16184,7 +17180,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16199,7 +17197,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -16264,7 +17264,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16293,7 +17295,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16322,7 +17326,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16347,7 +17353,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16363,7 +17371,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16379,7 +17389,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16393,7 +17405,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16407,7 +17421,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16422,7 +17438,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16437,7 +17455,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -16502,7 +17522,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16531,7 +17553,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16560,7 +17584,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16585,7 +17611,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16601,7 +17629,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16617,7 +17647,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16631,7 +17663,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16645,7 +17679,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16660,7 +17696,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -16675,7 +17713,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -17025,6 +18065,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17058,6 +18099,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17091,6 +18133,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17120,6 +18163,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17140,6 +18184,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17159,6 +18204,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17176,6 +18222,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17193,6 +18240,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17211,6 +18259,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17230,6 +18279,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17308,7 +18358,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17341,7 +18393,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17374,7 +18428,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17403,7 +18459,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17423,7 +18481,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17442,7 +18502,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17459,7 +18521,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17476,7 +18540,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17494,7 +18560,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17513,7 +18581,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17592,7 +18662,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17625,7 +18697,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17658,7 +18732,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17687,7 +18763,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17707,7 +18785,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -17726,7 +18806,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17743,7 +18825,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17760,7 +18844,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -17778,7 +18864,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17797,7 +18885,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17877,6 +18967,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17910,6 +19001,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17943,6 +19035,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17972,6 +19065,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17992,6 +19086,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18011,6 +19106,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18028,6 +19124,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18045,6 +19142,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18063,6 +19161,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18082,6 +19181,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18161,6 +19261,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 
v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18194,6 +19295,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18227,6 +19329,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18256,6 +19359,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18276,6 +19380,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18295,6 +19400,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18312,6 +19418,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18329,6 +19436,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18347,6 +19455,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18366,6 +19475,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18444,7 +19554,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18477,7 +19589,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18510,7 +19624,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 
s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18539,7 +19655,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18559,7 +19677,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18578,7 +19698,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -18595,7 +19717,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18612,7 +19736,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18630,7 +19756,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18649,7 +19777,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; 
GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18728,7 +19858,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18761,7 +19893,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18794,7 +19928,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18823,7 +19959,9 @@ define 
amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18843,7 +19981,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18862,7 +20002,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18879,7 +20021,9 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18896,7 +20040,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18914,7 +20060,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18933,7 +20081,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19012,7 +20162,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19045,7 +20197,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19078,7 +20232,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19107,7 +20263,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19127,7 +20285,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19146,7 +20306,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19163,7 +20325,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19180,7 +20344,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19198,7 +20364,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19217,7 +20385,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19296,7 +20466,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19329,7 +20501,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19362,7 +20536,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19391,7 +20567,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19411,7 +20589,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19430,7 +20610,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19447,7 +20629,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19464,7 +20648,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19482,7 +20668,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19501,7 +20689,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19580,7 +20770,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19613,7 +20805,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19646,7 +20840,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19675,7 +20871,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19695,7 +20893,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19714,7 +20914,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19731,7 +20933,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19748,7 +20952,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19766,7 +20972,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19785,7 +20993,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19864,7 +21074,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19897,7 +21109,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19930,7 +21144,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19959,7 +21175,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19979,7 +21197,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19998,7 +21218,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20015,7 +21237,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20032,7 +21256,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20050,7 +21276,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20069,7 +21297,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20148,7 +21378,9 @@ define 
amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20181,7 +21413,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20214,7 +21448,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20243,7 +21479,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20263,7 +21501,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20282,7 +21522,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20299,7 +21541,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) 
; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20316,7 +21560,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20334,7 +21580,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20353,7 +21601,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20432,7 +21682,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: 
v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20465,7 +21717,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20498,7 +21752,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20527,7 +21783,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20547,7 
+21805,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20566,7 +21826,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20583,7 +21845,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20600,7 +21864,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20618,7 +21884,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20637,7 +21905,9 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 0fd4aa4a7a93f..e84a44d31cf94 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -10829,6 +10829,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10867,6 +10868,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10881,6 +10883,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10896,6 +10899,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10923,6 +10927,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10965,6 +10970,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11018,7 +11024,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11058,7 +11066,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11072,7 +11082,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11087,7 +11099,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11115,7 +11129,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11160,7 +11176,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11519,6 +11537,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11551,6 +11570,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11562,6 +11582,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11574,6 +11595,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11597,6 +11619,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11632,6 +11655,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11678,6 +11702,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11710,6 +11735,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11721,6 +11747,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +11760,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11756,6 +11784,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11791,6 +11820,7 @@ define 
amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11987,6 +12017,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12002,6 +12033,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12019,6 +12051,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12030,6 +12063,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12042,6 +12076,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 
s4 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12066,6 +12101,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12089,6 +12125,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12102,6 +12139,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: @@ -12145,6 +12183,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12177,6 +12216,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -12188,6 +12228,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12200,6 +12241,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12223,6 +12265,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12258,6 +12301,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12304,7 +12348,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: 
flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12322,6 +12368,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12338,7 +12385,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12349,7 +12398,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12361,7 +12412,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12386,7 +12439,9 @@ define amdgpu_kernel 
void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12413,6 +12468,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12425,7 +12481,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12473,7 +12531,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12491,6 +12551,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12507,7 +12568,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12518,7 +12581,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12530,7 +12595,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12555,7 +12622,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) 
; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12582,6 +12651,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12594,7 +12664,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12643,6 +12715,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12683,6 +12756,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12698,6 +12772,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12714,6 +12789,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12743,6 +12819,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12788,6 +12865,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12844,7 +12922,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12886,7 +12966,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12901,7 +12983,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12917,7 +13001,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12947,7 +13033,9 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12995,7 +13083,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13058,7 +13148,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13100,7 +13192,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13115,7 +13209,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13131,7 +13227,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13161,7 +13259,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13209,7 +13309,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 
s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13525,6 +13627,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13554,6 +13657,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -13585,6 +13689,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13610,6 +13715,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13626,6 +13732,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13658,6 +13765,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13689,6 +13797,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -13706,6 +13815,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13772,6 +13882,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; 
GFX7-NEXT: s_endpgm ; @@ -13832,6 +13943,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -13857,6 +13969,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13873,6 +13986,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13904,6 +14018,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13951,6 +14066,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -14020,7 +14136,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14052,6 +14170,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14082,7 +14201,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14107,7 +14228,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14123,7 +14246,9 @@ define 
amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14156,7 +14281,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14191,6 +14318,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14207,7 +14335,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14278,7 +14408,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14310,6 +14442,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14340,7 +14473,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14365,7 +14500,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ 
-14381,7 +14518,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14414,7 +14553,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14449,6 +14590,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14465,7 +14607,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14537,6 +14681,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14566,6 +14711,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14597,6 +14743,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14622,6 +14769,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14638,6 +14786,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], 
v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14670,6 +14819,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14701,6 +14851,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14718,6 +14869,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14785,6 +14937,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14814,6 +14967,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14845,6 +14999,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14870,6 +15025,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14886,6 +15042,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14918,6 +15075,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14949,6 +15107,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14966,6 +15125,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: @@ -15032,7 +15192,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15064,6 +15226,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15094,7 +15257,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15119,7 +15284,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15135,7 +15302,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15168,7 +15337,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15203,6 +15374,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15219,7 +15391,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15290,7 +15464,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15322,6 +15498,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15352,7 +15529,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15377,7 +15556,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15393,7 +15574,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15426,7 +15609,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15461,6 +15646,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15477,7 +15663,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15548,7 +15736,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15580,6 +15770,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15610,7 +15801,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15635,7 +15828,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15651,7 +15846,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15684,7 +15881,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15719,6 +15918,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15735,7 +15935,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15806,7 +16008,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15838,6 +16042,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15868,7 +16073,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15893,7 +16100,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15909,7 +16118,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15942,7 +16153,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15977,6 +16190,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15993,7 +16207,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -16064,7 +16280,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16096,6 +16314,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16126,7 +16345,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16151,7 +16372,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16167,7 +16390,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16200,7 +16425,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16235,6 +16462,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16251,7 +16479,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16322,7 +16552,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16354,6 +16586,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16384,7 +16617,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16409,7 +16644,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16425,7 +16662,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16458,7 +16697,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16493,6 +16734,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16509,7 +16751,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16580,7 +16824,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16612,6 +16858,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16642,7 +16889,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16667,7 +16916,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16683,7 +16934,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16716,7 +16969,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16751,6 +17006,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16767,7 +17023,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16838,7 +17096,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16870,6 +17130,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16900,7 +17161,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16925,7 +17188,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16941,7 +17206,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16974,7 +17241,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -17009,6 +17278,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -17025,7 +17295,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -17381,6 +17653,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17449,6 +17722,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17478,6 +17752,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17498,6 +17773,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17535,6 +17811,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -17592,6 +17869,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17672,6 +17950,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17740,6 +18019,7 @@ define 
amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -17769,6 +18049,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -17789,6 +18070,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17826,6 +18108,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17883,6 
+18166,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -17966,7 +18250,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18036,7 +18322,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18065,7 +18353,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-18085,7 +18375,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18123,7 +18415,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18183,7 +18477,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18270,7 +18566,9 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18340,7 +18638,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18369,7 +18669,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18389,7 +18691,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18427,7 +18731,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18487,7 +18793,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18575,6 +18883,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18643,6 +18952,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18672,6 +18982,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18692,6 +19003,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18729,6 +19041,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18786,6 +19099,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 
v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18869,6 +19183,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18937,6 +19252,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18966,6 +19282,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18986,6 +19303,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19023,6 +19341,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19080,6 +19399,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19160,7 +19480,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19230,7 +19552,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19259,7 +19583,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19279,7 +19605,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19317,7 +19645,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19377,7 +19707,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19464,7 +19796,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19534,7 +19868,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19563,7 +19899,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19583,7 +19921,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19621,7 +19961,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19681,7 +20023,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 
s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19768,7 +20112,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19838,7 +20184,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19867,7 +20215,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19887,7 +20237,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19925,7 +20277,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19985,7 +20339,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20072,7 +20428,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20142,7 +20500,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20171,7 +20531,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20191,7 +20553,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20229,7 +20593,9 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20289,7 +20655,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20376,7 +20744,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20446,7 +20816,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20475,7 +20847,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20495,7 +20869,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20533,7 +20909,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20593,7 +20971,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20678,7 +21058,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20748,7 +21130,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20777,7 +21161,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20797,7 +21183,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20835,7 +21223,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20895,7 +21285,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20982,7 +21374,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21052,7 +21446,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21081,7 +21477,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21101,7 +21499,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -21139,7 +21539,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -21199,7 +21601,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21286,7 +21690,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 
+; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21356,7 +21762,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21385,7 +21793,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21405,7 +21815,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -21443,7 +21855,9 @@ 
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -21503,7 +21917,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 74a72e04fa4ae..fb1818a77837f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -1672,6 +1672,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1685,6 +1686,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1764,6 +1766,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1777,6 +1780,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2038,6 +2042,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2053,6 +2058,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2141,6 +2147,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2156,6 +2163,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2246,6 +2254,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2261,6 +2270,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2349,6 +2359,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2364,6 +2375,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 
v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3347,6 +3359,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3364,6 +3377,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3468,6 +3482,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3485,6 +3500,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3843,6 +3859,7 @@ 
define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3862,6 +3879,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3975,6 +3993,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3994,6 +4013,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4116,6 +4136,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4135,6 +4156,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4248,6 +4270,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4267,6 +4290,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4385,6 +4409,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4402,6 +4427,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 
v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4506,6 +4532,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4523,6 +4550,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4633,6 +4661,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4650,6 +4679,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; 
GFX10-CU-NEXT: buffer_gl0_inv @@ -4754,6 +4784,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4771,6 +4802,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4885,6 +4917,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4904,6 +4937,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5017,6 +5051,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] 
offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5036,6 +5071,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5158,6 +5194,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5177,6 +5214,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5290,6 +5328,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5309,6 +5348,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5431,6 +5471,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5450,6 +5491,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5563,6 +5605,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5582,6 +5625,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5704,6 +5748,7 @@ define amdgpu_kernel void 
@global_agent_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5723,6 +5768,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5836,6 +5882,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5855,6 +5902,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5977,6 +6025,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5996,6 +6045,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6109,6 +6159,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6128,6 +6179,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6250,6 +6302,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6269,6 +6322,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6382,6 +6436,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6401,6 +6456,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6523,6 +6579,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6542,6 +6599,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6655,6 +6713,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6674,6 +6733,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6796,6 +6856,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6815,6 +6876,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6928,6 +6990,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6947,6 +7010,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( 
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12945,6 +13009,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -12958,6 +13023,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -13037,6 +13103,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13050,6 +13117,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13311,6 +13379,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13326,6 +13395,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -13414,6 +13484,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13429,6 +13500,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13519,6 +13591,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13534,6 +13607,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -13622,6 +13696,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13637,6 +13712,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14620,6 +14696,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14637,6 +14714,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14741,6 +14819,7 @@ define amdgpu_kernel void 
@global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14758,6 +14837,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15116,6 +15196,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15135,6 +15216,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15248,6 +15330,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15267,6 +15350,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15389,6 +15473,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15408,6 +15493,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15521,6 +15607,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15540,6 +15627,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15658,6 +15746,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15675,6 +15764,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15779,6 +15869,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15796,6 +15887,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15906,6 +15998,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15923,6 +16016,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16027,6 +16121,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16044,6 +16139,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16158,6 +16254,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16177,6 +16274,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16290,6 +16388,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16309,6 +16408,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16431,6 +16531,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16450,6 +16551,7 @@ define 
amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16563,6 +16665,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16582,6 +16685,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16704,6 +16808,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16723,6 +16828,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: 
buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16836,6 +16942,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16855,6 +16962,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16977,6 +17085,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16996,6 +17105,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17109,6 +17219,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17128,6 +17239,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17250,6 +17362,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17269,6 +17382,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17382,6 +17496,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17401,6 +17516,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17523,6 +17639,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17542,6 +17659,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17655,6 +17773,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17674,6 +17793,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17796,6 +17916,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17815,6 +17936,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17928,6 +18050,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17947,6 +18070,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18069,6 +18193,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18088,6 +18213,7 @@ define 
amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18201,6 +18327,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18220,6 +18347,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 8042d38716107..c80a926cbbd6e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -418,6 +418,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -586,6 +587,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 
; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -601,7 +603,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -650,6 +654,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1077,6 +1082,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1091,6 +1097,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1101,6 +1108,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -1111,6 +1119,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1129,6 +1138,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1139,6 +1149,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1149,6 +1160,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1159,6 +1171,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1169,6 +1182,7 @@ define amdgpu_kernel void 
@global_singlethread_release_store( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1179,6 +1193,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -1189,6 +1204,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1233,6 +1249,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1247,6 +1264,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1257,6 +1275,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -1267,6 +1286,7 @@ define amdgpu_kernel 
void @global_singlethread_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1285,6 +1305,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1295,6 +1316,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1305,6 +1327,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1315,6 +1338,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1325,6 +1349,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1335,6 +1360,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -1345,6 +1371,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1543,6 +1570,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_atomicrmw: @@ -1557,6 +1585,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acquire_atomicrmw: @@ -1567,6 +1596,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_atomicrmw: @@ -1577,6 +1607,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_atomicrmw: @@ -1594,6 +1625,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: @@ -1604,6 +1636,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: @@ -1614,6 +1647,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: @@ -1624,6 +1658,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: @@ -1634,6 +1669,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_atomicrmw: @@ -1644,6 +1680,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_atomicrmw: @@ -1654,6 +1691,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acquire_atomicrmw: @@ -1696,6 +1734,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1710,6 +1749,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1720,6 +1760,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] ; 
GFX10-WGP-NEXT: s_endpgm ; @@ -1730,6 +1771,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1747,6 +1789,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1757,6 +1800,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1767,6 +1811,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1777,6 +1822,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1787,6 +1833,7 @@ define amdgpu_kernel void 
@global_singlethread_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1797,6 +1844,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -1807,6 +1855,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1850,7 +1899,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1864,7 +1915,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1874,7 +1927,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: 
s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1884,7 +1939,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1901,7 +1958,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1911,7 +1970,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1921,7 +1982,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1931,7 +1994,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1941,7 +2006,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1951,7 +2018,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1961,7 +2030,9 @@ define amdgpu_kernel void 
@global_singlethread_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -2004,7 +2075,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2018,7 +2091,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2028,7 +2103,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2038,7 +2115,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2055,7 +2134,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2065,7 +2146,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2075,7 +2158,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2085,7 +2170,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2095,7 +2182,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2105,7 +2194,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2115,7 +2206,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -2175,6 +2268,7 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: 
s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2341,6 +2435,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2357,7 +2452,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2371,6 +2468,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -2383,6 +2481,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -2402,6 +2501,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2414,6 +2514,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2426,6 +2527,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2438,6 +2540,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2450,6 +2553,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2462,6 +2566,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2474,6 +2579,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2524,6 +2630,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2540,7 +2647,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2554,6 +2663,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -2566,6 +2676,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -2585,6 +2696,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2597,6 +2709,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2609,6 +2722,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] 
glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2621,6 +2735,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2633,6 +2748,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2645,6 +2761,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2657,6 +2774,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2932,6 +3050,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; 
GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -2960,6 +3079,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -2974,6 +3094,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -2988,6 +3109,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -3010,6 +3132,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_singlethread_acquire_monotonic_cmpxchg: @@ -3024,6 +3147,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -3038,6 +3162,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -3052,6 +3177,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -3066,6 +3192,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -3080,6 +3207,7 @@ define 
amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -3094,6 +3222,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -3150,6 +3279,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3178,6 +3308,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3192,6 +3323,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3206,6 
+3338,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3228,6 +3361,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3242,6 +3376,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3256,6 +3391,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3270,6 +3406,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3284,6 +3421,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3298,6 +3436,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -3312,6 +3451,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3369,7 +3509,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3397,7 +3539,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3411,7 +3555,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3425,7 +3571,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3447,7 +3595,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3461,7 +3611,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3475,7 +3627,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3489,7 +3643,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3503,7 +3659,9 @@ define 
amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3517,7 +3675,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3531,7 +3691,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -3588,7 +3750,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; 
GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3616,7 +3780,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3630,7 +3796,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3644,7 +3812,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3666,7 +3836,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3680,7 +3852,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3694,7 +3868,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3708,7 +3884,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3722,7 +3900,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3736,7 +3916,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3750,7 +3932,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -3808,6 +3992,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 
offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3836,6 +4021,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3850,6 +4036,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3864,6 +4051,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3886,6 +4074,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3900,6 +4089,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3914,6 +4104,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3928,6 +4119,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3942,6 +4134,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3956,6 +4149,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; 
GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -3970,6 +4164,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -4027,6 +4222,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4055,6 +4251,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4069,6 +4266,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4083,6 +4281,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 
killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4105,6 +4304,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4119,6 +4319,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4133,6 +4334,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4147,6 +4349,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4161,6 +4364,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4175,6 +4379,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4189,6 +4394,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -4245,7 +4451,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
s_endpgm ; ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4273,7 +4481,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4287,7 +4497,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4301,7 +4513,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4323,7 +4537,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4337,7 +4553,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4351,7 +4569,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4365,7 +4585,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4379,7 +4601,9 @@ define amdgpu_kernel void 
@global_singlethread_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4393,7 +4617,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4407,7 +4633,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -4464,7 +4692,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4492,7 +4722,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4506,7 +4738,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4520,7 +4754,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4542,7 +4778,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4556,7 +4794,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4570,7 +4810,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4584,7 +4826,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4598,7 +4842,9 
@@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4612,7 +4858,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4626,7 +4874,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -4683,7 +4933,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; 
GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4711,7 +4963,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4725,7 +4979,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4739,7 +4995,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4761,7 +5019,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4775,7 +5035,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4789,7 +5051,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4803,7 +5067,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: 
global_singlethread_seq_cst_acquire_cmpxchg: @@ -4817,7 +5083,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4831,7 +5099,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4845,7 +5115,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -4902,7 +5174,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -4930,7 +5204,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -4944,7 +5220,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -4958,7 +5236,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -4980,7 +5260,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -4994,7 +5276,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -5008,7 +5292,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -5022,7 +5308,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -5036,7 +5324,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -5050,7 +5340,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -5064,7 +5356,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -5121,7 +5415,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, 
v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5149,7 +5445,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5163,7 +5461,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5177,7 +5477,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5199,7 +5501,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed 
$exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5213,7 +5517,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5227,7 +5533,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5241,7 +5549,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5255,7 +5565,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5269,7 +5581,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5283,7 +5597,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -5340,7 +5656,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; 
GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5368,7 +5686,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5382,7 +5702,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5396,7 +5718,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5418,7 +5742,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed 
$vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5432,7 +5758,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5446,7 +5774,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5460,7 +5790,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5474,7 +5806,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5488,7 +5822,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5502,7 +5838,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -5559,7 +5897,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5587,7 +5927,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5601,7 +5943,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5615,7 +5959,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5637,7 +5983,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: 
; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5651,7 +5999,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5665,7 +6015,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5679,7 +6031,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, 
v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5693,7 +6047,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5707,7 +6063,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5721,7 +6079,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -5778,7 +6138,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5806,7 +6168,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5820,7 +6184,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5834,7 +6200,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5856,7 +6224,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 
s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5870,7 +6240,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5884,7 +6256,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5898,7 +6272,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5912,7 +6288,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5926,7 +6304,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5940,7 +6320,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -6249,8 +6631,8 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -6280,6 +6662,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6338,8 +6721,8 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6499,6 +6882,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6530,6 +6914,7 @@ define amdgpu_kernel void 
@global_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6548,6 +6933,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -6564,6 +6950,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -6588,6 +6975,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6605,6 +6993,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6621,6 +7010,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6637,6 +7027,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6653,6 +7044,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6669,6 +7061,7 
@@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6685,6 +7078,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6750,9 +7144,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -6781,7 +7176,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6799,6 +7196,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -6815,6 +7213,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -6839,9 +7238,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6856,6 +7256,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6872,6 +7273,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6888,6 +7290,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6904,6 +7307,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6920,6 +7324,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6936,6 +7341,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7001,9 +7407,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7032,7 +7439,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7050,6 +7459,7 @@ 
define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -7066,6 +7476,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -7090,9 +7501,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7107,6 +7519,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7123,6 +7536,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7139,6 +7553,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7155,6 +7570,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7171,6 +7587,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed 
$vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7187,6 +7604,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7253,8 +7671,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7284,6 +7702,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7342,8 +7761,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], 
off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7504,8 +7923,8 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7535,6 +7954,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7593,8 +8013,8 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7754,9 +8174,10 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; 
GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7785,7 +8206,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7803,6 +8226,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -7819,6 +8243,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -7843,9 +8268,10 @@ define 
amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7860,6 +8286,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7876,6 +8303,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7892,6 +8320,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7908,6 +8337,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7924,6 +8354,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7940,6 +8371,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8005,9 +8437,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed 
$exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8036,7 +8469,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8054,6 +8489,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -8070,6 +8506,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8094,9 +8531,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8111,6 +8549,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8127,6 +8566,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8143,6 +8583,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8159,6 +8600,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8175,6 +8617,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8191,6 +8634,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8256,9 +8700,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8287,7 +8732,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8305,6 +8752,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -8321,6 +8769,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8345,9 +8794,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8362,6 +8812,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8378,6 +8829,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8394,6 +8846,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 
sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8410,6 +8863,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8426,6 +8880,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8442,6 +8897,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8507,9 +8963,10 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8538,7 +8995,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8556,6 +9015,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -8572,6 +9032,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8596,9 +9057,10 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8613,6 +9075,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8629,6 +9092,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8645,6 +9109,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
global_store_dword v0, v1, s[0:1] @@ -8661,6 +9126,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8677,6 +9143,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8693,6 +9160,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8758,9 +9226,10 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt 
vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8789,7 +9258,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8807,6 +9278,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -8823,6 +9295,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8847,9 +9320,10 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 
offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8864,6 +9338,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8880,6 +9355,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8896,6 +9372,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8912,6 +9389,7 @@ define amdgpu_kernel void 
@global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8928,6 +9406,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8944,6 +9423,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9009,9 +9489,10 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec 
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9040,7 +9521,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9058,6 +9541,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -9074,6 +9558,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9098,9 +9583,10 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed 
$vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9115,6 +9601,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9131,6 +9618,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9147,6 +9635,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9163,6 +9652,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9179,6 +9669,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9195,6 +9686,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9260,9 +9752,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9291,7 +9784,9 @@ 
define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9309,6 +9804,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -9325,6 +9821,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9349,9 +9846,10 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9366,6 +9864,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9382,6 +9881,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9398,6 +9898,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9414,6 +9915,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9430,6 +9932,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9446,6 +9949,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9511,9 +10015,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9542,7 +10047,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 
v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9560,6 +10067,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -9576,6 +10084,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9600,9 +10109,10 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9617,6 +10127,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9633,6 +10144,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9649,6 +10161,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9665,6 +10178,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9681,6 +10195,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9697,6 +10212,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10147,6 +10663,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10315,6 +10832,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -10330,7 +10848,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10379,6 +10899,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -10806,6 +11327,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10820,6 +11342,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10830,6 +11353,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -10840,6 +11364,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -10858,6 +11383,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10868,6 +11394,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10878,6 +11405,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10888,6 +11416,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10898,6 +11427,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_store_dword 
v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10908,6 +11438,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -10918,6 +11449,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -10962,6 +11494,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10976,6 +11509,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10986,6 +11520,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -10996,6 +11531,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -11014,6 +11550,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11024,6 +11561,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11034,6 +11572,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11044,6 +11583,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11054,6 +11594,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11064,6 +11605,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -11074,6 +11616,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -11272,6 +11815,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11286,6 +11830,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11296,6 +11841,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11306,6 +11852,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: 
global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11323,6 +11870,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11333,6 +11881,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11343,6 +11892,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11353,6 +11903,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11363,6 +11914,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11373,6 +11925,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11383,6 +11936,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -11425,6 +11979,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11439,6 +11994,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11449,6 +12005,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: 
global_atomic_swap v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -11459,6 +12016,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -11476,6 +12034,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11486,6 +12045,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11496,6 +12056,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11506,6 +12067,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11516,6 +12078,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11526,6 +12089,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -11536,6 +12100,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -11579,7 +12144,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11593,7 +12160,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: 
global_singlethread_one_as_acq_rel_atomicrmw: @@ -11603,7 +12172,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11613,7 +12184,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11630,7 +12203,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11640,7 +12215,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11650,7 +12227,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11660,7 +12239,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11670,7 +12251,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11680,7 +12263,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11690,7 +12275,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -11733,7 +12320,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11747,7 +12336,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11757,7 +12348,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; 
GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11767,7 +12360,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11784,7 +12379,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11794,7 +12391,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11804,7 +12403,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11814,7 +12415,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11824,7 +12427,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11834,7 +12439,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11844,7 +12451,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -11904,6 +12513,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12070,6 +12680,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12086,7 +12697,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12100,6 +12713,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -12112,6 +12726,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; 
GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -12131,6 +12746,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -12143,6 +12759,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12155,6 +12772,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12167,6 +12785,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12179,6 +12798,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12191,6 +12811,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12203,6 +12824,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12253,6 +12875,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12269,7 +12892,9 @@ define 
amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12283,6 +12908,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -12295,6 +12921,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -12314,6 +12941,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -12326,6 +12954,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12338,6 +12967,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12350,6 +12980,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12362,6 +12993,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12374,6 +13006,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12386,6 +13019,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12661,6 +13295,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12689,6 +13324,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12703,6 +13339,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12717,6 +13354,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 
killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12739,6 +13377,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12753,6 +13392,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12767,6 +13407,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12781,6 +13422,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12795,6 +13437,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12809,6 +13452,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12823,6 +13467,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -12879,6 +13524,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -12907,6 +13553,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -12921,6 +13568,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -12935,6 +13583,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -12957,6 +13606,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12971,6 +13621,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12985,6 +13636,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -12999,6 +13651,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13013,6 +13666,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13027,6 +13681,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13041,6 
+13696,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13098,7 +13754,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13126,7 +13784,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13140,7 +13800,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13154,7 +13816,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13176,7 +13840,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13190,7 +13856,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13204,7 +13872,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13218,7 +13888,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13232,7 +13904,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13246,7 +13920,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13260,7 +13936,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -13317,7 +13995,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13345,7 +14025,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13359,7 +14041,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: 
global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13373,7 +14057,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13395,7 +14081,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13409,7 +14097,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ 
-13423,7 +14113,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13437,7 +14129,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13451,7 +14145,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13465,7 +14161,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed 
$vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13479,7 +14177,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -13537,6 +14237,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13565,6 +14266,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13579,6 +14281,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, 
v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13593,6 +14296,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13615,6 +14319,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13629,6 +14334,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13643,6 +14349,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13657,6 +14364,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13671,6 +14379,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13685,6 +14394,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13699,6 +14409,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: 
global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -13756,6 +14467,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13784,6 +14496,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13798,6 +14511,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13812,6 +14526,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13834,6 +14549,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13848,6 +14564,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13862,6 +14579,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13876,6 +14594,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13890,6 +14609,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13904,6 +14624,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13918,6 +14639,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -13974,7 +14696,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14002,7 +14726,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14016,7 +14742,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14030,7 +14758,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14052,7 +14782,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14066,7 +14798,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14080,7 +14814,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14094,7 +14830,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14108,7 +14846,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14122,7 +14862,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14136,7 +14878,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -14193,7 +14937,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14221,7 +14967,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, 
v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14235,7 +14983,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14249,7 +14999,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14271,7 +15023,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14285,7 +15039,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14299,7 +15055,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14313,7 +15071,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14327,7 +15087,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14341,7 +15103,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14355,7 +15119,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -14412,7 +15178,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14440,7 +15208,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14454,7 +15224,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14468,7 +15240,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14490,7 +15264,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14504,7 +15280,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14518,7 +15296,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14532,7 +15312,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14546,7 +15328,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14560,7 +15344,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14574,7 +15360,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -14631,7 +15419,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14659,7 +15449,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14673,7 +15465,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14687,7 +15481,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14709,7 +15505,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14723,7 +15521,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14737,7 +15537,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14751,7 +15553,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14765,7 +15569,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14779,7 +15585,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14793,7 +15601,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -14850,7 +15660,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -14878,7 +15690,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -14892,7 +15706,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -14906,7 +15722,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], 
s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -14928,7 +15746,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -14942,7 +15762,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -14956,7 +15778,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ 
-14970,7 +15794,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -14984,7 +15810,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -14998,7 +15826,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15012,7 +15842,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -15069,7 +15901,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15097,7 +15931,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15111,7 +15947,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15125,7 +15963,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: 
; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15147,7 +15987,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15161,7 +16003,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15175,7 +16019,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, 
v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15189,7 +16035,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15203,7 +16051,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15217,7 +16067,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15231,7 +16083,9 @@ define 
amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -15288,7 +16142,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15316,7 +16172,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15330,7 +16188,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15344,7 +16204,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15366,7 +16228,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15380,7 +16244,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15394,7 +16260,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15408,7 +16276,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15422,7 +16292,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15436,7 +16308,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], 
s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15450,7 +16324,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -15507,7 +16383,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15535,7 +16413,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15549,7 +16429,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15563,7 +16445,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15585,7 +16469,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15599,7 +16485,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15613,7 +16501,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15627,7 +16517,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15641,7 +16533,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15655,7 +16549,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; 
GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15669,7 +16565,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -15978,8 +16876,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -16009,6 +16907,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -16067,8 +16966,8 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acquire_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -16228,6 +17127,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -16259,6 +17159,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -16277,6 +17178,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -16293,6 +17195,7 @@ define amdgpu_kernel void 
@global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -16317,6 +17220,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -16334,6 +17238,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16350,6 +17255,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16366,6 +17272,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16382,6 +17289,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16398,6 +17306,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16414,6 +17323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, 
v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16479,9 +17389,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -16510,7 +17421,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -16528,6 +17441,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -16544,6 +17458,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -16568,9 +17483,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -16585,6 +17501,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16601,6 +17518,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16617,6 +17535,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16633,6 +17552,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16649,6 +17569,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16665,6 +17586,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16730,9 +17652,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -16761,7 +17684,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -16779,6 +17704,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -16795,6 +17721,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -16819,9 +17746,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -16836,6 +17764,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16852,6 +17781,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16868,6 +17798,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16884,6 +17815,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16900,6 +17832,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16916,6 +17849,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16982,8 +17916,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17013,6 +17947,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17071,8 +18006,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17233,8 +18168,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17264,6 +18199,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17322,8 +18258,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17483,9 +18419,10 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17514,7 +18451,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17532,6 +18471,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -17548,6 +18488,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -17572,9 +18513,10 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17589,6 +18531,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17605,6 +18548,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17621,6 +18565,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17637,6 +18582,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17653,6 +18599,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17669,6 +18616,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17734,9 +18682,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17765,7 +18714,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17783,6 +18734,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -17799,6 +18751,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -17823,9 +18776,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17840,6 +18794,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17856,6 +18811,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17872,6 +18828,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17888,6 +18845,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17904,6 +18862,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17920,6 +18879,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17985,9 +18945,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18016,7 +18977,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18034,6 +18997,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -18050,6 +19014,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18074,9 +19039,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18091,6 +19057,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18107,6 +19074,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18123,6 +19091,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18139,6 +19108,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18155,6 +19125,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18171,6 +19142,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18236,9 +19208,10 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18267,7 +19240,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18285,6 +19260,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -18301,6 +19277,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18325,9 +19302,10 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18342,6 +19320,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18358,6 +19337,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18374,6 +19354,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18390,6 +19371,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18406,6 +19388,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18422,6 +19405,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18487,9 +19471,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18518,7 +19503,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18536,6 +19523,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -18552,6 +19540,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18576,9 +19565,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18593,6 +19583,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18609,6 +19600,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18625,6 +19617,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18641,6 +19634,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18657,6 +19651,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18673,6 +19668,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18738,9 +19734,10 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18769,7 +19766,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18787,6 +19786,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -18803,6 +19803,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18827,9 +19828,10 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18844,6 +19846,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18860,6 +19863,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18876,6 +19880,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18892,6 +19897,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18908,6 +19914,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18924,6 +19931,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18989,9 +19997,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19020,7 +20029,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19038,6 +20049,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -19054,6 +20066,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19078,9 +20091,10 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19095,6 +20109,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19111,6 +20126,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19127,6 +20143,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19143,6 +20160,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19159,6 +20177,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19175,6 +20194,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19240,9 +20260,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19271,7 +20292,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19289,6 +20312,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -19305,6 +20329,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19329,9 +20354,10 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19346,6 +20372,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19362,6 +20389,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19378,6 +20406,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19394,6 +20423,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19410,6 +20440,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19426,6 +20457,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index be148464c156e..650e9432be81e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -1684,6 +1684,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1697,6 +1698,7 @@ define 
amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1778,6 +1780,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1791,6 +1794,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2056,6 +2060,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2071,6 +2076,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2163,6 +2169,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2178,6 +2185,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2270,6 +2278,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2285,6 +2294,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2377,6 +2387,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2392,6 +2403,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3391,6 +3403,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3408,6 +3421,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3514,6 +3528,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3531,6 +3546,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3893,6 +3909,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3912,6 +3929,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4029,6 +4047,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4048,6 +4067,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4172,6 +4192,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, 
v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4191,6 +4212,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4308,6 +4330,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4327,6 +4350,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4447,6 +4471,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4464,6 +4489,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; 
GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4570,6 +4596,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4587,6 +4614,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4697,6 +4725,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4714,6 +4743,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4820,6 +4850,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4837,6 +4868,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4951,6 +4983,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4970,6 +5003,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5087,6 +5121,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5106,6 +5141,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5230,6 +5266,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5249,6 +5286,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5366,6 +5404,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5385,6 
+5424,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5509,6 +5549,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5528,6 +5569,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5645,6 +5687,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5664,6 +5707,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5788,6 +5832,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5807,6 +5852,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5924,6 +5970,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5943,6 +5990,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11745,6 +11793,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -11758,6 +11807,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -11839,6 +11889,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11852,6 +11903,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12117,6 +12169,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -12132,6 +12185,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -12224,6 +12278,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12239,6 +12294,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12331,6 +12387,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -12346,6 +12403,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -12438,6 +12496,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12453,6 +12512,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13452,6 +13512,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13469,6 +13530,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -13575,6 +13637,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13592,6 +13655,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: 
def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13954,6 +14018,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13973,6 +14038,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14090,6 +14156,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14109,6 +14176,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: 
buffer_gl0_inv @@ -14233,6 +14301,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14252,6 +14321,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14369,6 +14439,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14388,6 +14459,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14508,6 +14580,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14525,6 +14598,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14631,6 +14705,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14648,6 +14723,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14758,6 +14834,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14775,6 
+14852,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14881,6 +14959,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14898,6 +14977,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15012,6 +15092,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15031,6 +15112,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], 
s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15148,6 +15230,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15167,6 +15250,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15291,6 +15375,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15310,6 +15395,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15427,6 +15513,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15446,6 +15533,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15570,6 +15658,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15589,6 +15678,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15706,6 +15796,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15725,6 +15816,7 @@ define amdgpu_kernel 
void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15849,6 +15941,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15868,6 +15961,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15985,6 +16079,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16004,6 +16099,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16128,6 +16224,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16147,6 +16244,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16264,6 +16362,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16283,6 +16382,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16407,6 +16507,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16426,6 +16527,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16543,6 +16645,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16562,6 +16665,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16686,6 +16790,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16705,6 +16810,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16822,6 +16928,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16841,6 +16948,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16965,6 +17073,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16984,6 +17093,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17101,6 +17211,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17120,6 +17231,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 8a5c5dda9f79c..d44a6412971f6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -403,6 +403,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -416,6 +417,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; @@ -450,6 +452,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -463,6 +466,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; @@ -576,6 +580,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -590,6 +595,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; @@ -631,6 +637,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -647,6 +654,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; @@ -739,6 +747,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 151ba07a0b531..8e6559cfa8c7f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -418,6 +418,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -586,6 +587,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -601,7 +603,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -650,6 +654,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1077,6 +1082,7 @@ define amdgpu_kernel void 
@global_wavefront_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1091,6 +1097,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1101,6 +1108,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -1111,6 +1119,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1129,6 +1138,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1139,6 +1149,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1149,6 
+1160,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1159,6 +1171,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1169,6 +1182,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1179,6 +1193,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -1189,6 +1204,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1233,6 +1249,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1247,6 +1264,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1257,6 +1275,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -1267,6 +1286,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1285,6 +1305,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1295,6 +1316,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1305,6 +1327,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1315,6 +1338,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1325,6 +1349,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1335,6 +1360,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -1345,6 +1371,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1543,6 +1570,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_atomicrmw: @@ -1557,6 +1585,7 @@ define amdgpu_kernel 
void @global_wavefront_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acquire_atomicrmw: @@ -1567,6 +1596,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_atomicrmw: @@ -1577,6 +1607,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_atomicrmw: @@ -1594,6 +1625,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: @@ -1604,6 +1636,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: @@ -1614,6 +1647,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: 
global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: @@ -1624,6 +1658,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: @@ -1634,6 +1669,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_atomicrmw: @@ -1644,6 +1680,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_atomicrmw: @@ -1654,6 +1691,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acquire_atomicrmw: @@ -1696,6 +1734,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; 
GFX6-NEXT: s_endpgm ; @@ -1710,6 +1749,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1720,6 +1760,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -1730,6 +1771,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1747,6 +1789,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1757,6 +1800,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1767,6 +1811,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1777,6 +1822,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1787,6 +1833,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1797,6 +1844,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -1807,6 +1855,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1850,7 +1899,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1864,7 +1915,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1874,7 +1927,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1884,7 +1939,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1901,7 +1958,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1911,7 +1970,9 @@ define amdgpu_kernel 
void @global_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1921,7 +1982,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1931,7 +1994,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1941,7 +2006,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_wavefront_acq_rel_atomicrmw: @@ -1951,7 +2018,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1961,7 +2030,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -2004,7 +2075,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2018,7 +2091,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2028,7 +2103,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2038,7 +2115,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2055,7 +2134,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2065,7 +2146,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2075,7 +2158,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2085,7 +2170,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2095,7 +2182,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2105,7 +2194,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2115,7 +2206,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -2175,6 +2268,7 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2341,6 +2435,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2357,7 +2452,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2371,6 +2468,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -2383,6 +2481,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, 
s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -2402,6 +2501,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2414,6 +2514,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2426,6 +2527,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2438,6 +2540,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2450,6 +2553,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2462,6 +2566,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2474,6 +2579,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2524,6 +2630,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2540,7 +2647,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 
v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2554,6 +2663,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -2566,6 +2676,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -2585,6 +2696,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2597,6 +2709,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2609,6 +2722,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -2621,6 +2735,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2633,6 +2748,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -2645,6 +2761,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2657,6 +2774,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; 
GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2932,6 +3050,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -2960,6 +3079,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -2974,6 +3094,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -2988,6 +3109,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -3010,6 +3132,7 @@ define amdgpu_kernel 
void @global_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -3024,6 +3147,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -3038,6 +3162,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -3052,6 +3177,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -3066,6 +3192,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -3080,6 +3207,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -3094,6 +3222,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -3150,6 +3279,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -3178,6 +3308,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -3192,6 +3323,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3206,6 +3338,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3228,6 +3361,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -3242,6 +3376,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3256,6 +3391,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3270,6 +3406,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3284,6 +3421,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3298,6 +3436,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -3312,6 +3451,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3369,7 +3509,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 
killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3397,7 +3539,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3411,7 +3555,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3425,7 +3571,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3447,7 +3595,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 
killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3461,7 +3611,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3475,7 +3627,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3489,7 +3643,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3503,7 +3659,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3517,7 +3675,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3531,7 +3691,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -3588,7 +3750,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3616,7 +3780,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3630,7 +3796,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3644,7 +3812,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3666,7 +3836,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: 
def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3680,7 +3852,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3694,7 +3868,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3708,7 +3884,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] 
offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3722,7 +3900,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3736,7 +3916,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3750,7 +3932,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -3808,6 +3992,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; 
GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3836,6 +4021,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3850,6 +4036,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3864,6 +4051,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3886,6 +4074,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3900,6 +4089,7 @@ define amdgpu_kernel void 
@global_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3914,6 +4104,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3928,6 +4119,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3942,6 +4134,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3956,6 +4149,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def 
$vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -3970,6 +4164,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -4027,6 +4222,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4055,6 +4251,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4069,6 +4266,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4083,6 +4281,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; 
GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4105,6 +4304,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4119,6 +4319,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4133,6 +4334,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4147,6 +4349,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4161,6 +4364,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4175,6 +4379,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4189,6 +4394,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -4245,7 +4451,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_release_acquire_cmpxchg: @@ -4273,7 +4481,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4287,7 +4497,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4301,7 +4513,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4323,7 +4537,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4337,7 +4553,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4351,7 +4569,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4365,7 +4585,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4379,7 +4601,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4393,7 +4617,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4407,7 +4633,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -4464,7 +4692,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_acq_rel_acquire_cmpxchg: @@ -4492,7 +4722,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4506,7 +4738,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4520,7 +4754,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4542,7 +4778,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4556,7 +4794,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4570,7 +4810,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4584,7 +4826,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4598,7 +4842,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4612,7 +4858,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4626,7 +4874,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -4683,7 +4933,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_seq_cst_acquire_cmpxchg: @@ -4711,7 +4963,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4725,7 +4979,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4739,7 +4995,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4761,7 +5019,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4775,7 +5035,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4789,7 +5051,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4803,7 +5067,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4817,7 +5083,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4831,7 +5099,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4845,7 +5115,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -4902,7 +5174,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_monotonic_seq_cst_cmpxchg: @@ -4930,7 +5204,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -4944,7 +5220,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -4958,7 +5236,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -4980,7 +5260,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -4994,7 +5276,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -5008,7 +5292,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -5022,7 +5308,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -5036,7 +5324,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( 
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -5050,7 +5340,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -5064,7 +5356,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -5121,7 +5415,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_acquire_seq_cst_cmpxchg: @@ -5149,7 +5445,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5163,7 +5461,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5177,7 +5477,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5199,7 +5501,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5213,7 +5517,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5227,7 +5533,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5241,7 +5549,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5255,7 +5565,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5269,7 +5581,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5283,7 +5597,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -5340,7 +5656,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_release_seq_cst_cmpxchg: @@ -5368,7 +5686,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5382,7 +5702,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5396,7 +5718,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5418,7 +5742,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5432,7 +5758,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5446,7 +5774,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5460,7 +5790,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5474,7 +5806,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5488,7 +5822,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5502,7 +5838,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -5559,7 +5897,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5587,7 +5927,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5601,7 +5943,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5615,7 +5959,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5637,7 +5983,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5651,7 +5999,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5665,7 +6015,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5679,7 +6031,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5693,7 +6047,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5707,7 +6063,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5721,7 +6079,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -5778,7 +6138,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5806,7 +6168,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5820,7 +6184,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5834,7 +6200,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5856,7 +6224,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5870,7 +6240,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5884,7 +6256,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5898,7 +6272,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5912,7 +6288,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5926,7 +6304,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5940,7 +6320,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -6249,8 +6631,8 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -6280,6 +6662,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6338,8 +6721,8 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6499,6 +6882,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6530,6 +6914,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6548,6 +6933,7 @@ define amdgpu_kernel void 
@global_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -6564,6 +6950,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -6588,6 +6975,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6605,6 +6993,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] 
@@ -6621,6 +7010,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6637,6 +7027,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6653,6 +7044,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6669,6 +7061,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6685,6 +7078,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6750,9 +7144,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -6781,7 +7176,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6799,6 +7196,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: 
global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -6815,6 +7213,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -6839,9 +7238,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -6856,6 +7256,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6872,6 +7273,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -6888,6 +7290,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6904,6 +7307,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -6920,6 +7324,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6936,6 +7341,7 @@ define 
amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7001,9 +7407,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7032,7 +7439,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7050,6 +7459,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -7066,6 +7476,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -7090,9 +7501,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7107,6 +7519,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7123,6 +7536,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7139,6 +7553,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7155,6 +7570,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7171,6 +7587,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7187,6 +7604,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; 
GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7253,8 +7671,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7284,6 +7702,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7342,8 +7761,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7504,8 +7923,8 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: ; 
kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7535,6 +7954,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7593,8 +8013,8 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7754,9 +8174,10 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; 
GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7785,7 +8206,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7803,6 +8226,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -7819,6 +8243,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -7843,9 +8268,10 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed 
$exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7860,6 +8286,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7876,6 +8303,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -7892,6 +8320,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7908,6 +8337,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -7924,6 +8354,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7940,6 +8371,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8005,9 +8437,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8036,7 +8469,9 @@ define amdgpu_kernel void 
@global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8054,6 +8489,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -8070,6 +8506,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8094,9 +8531,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 
killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8111,6 +8549,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8127,6 +8566,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8143,6 +8583,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8159,6 +8600,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8175,6 +8617,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8191,6 +8634,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8256,9 +8700,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8287,7 +8732,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8305,6 +8752,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -8321,6 +8769,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8345,9 +8794,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8362,6 +8812,7 
@@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8378,6 +8829,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8394,6 +8846,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8410,6 +8863,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8426,6 +8880,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8442,6 +8897,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8507,9 +8963,10 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8538,7 +8995,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8556,6 +9015,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -8572,6 +9032,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8596,9 +9057,10 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8613,6 +9075,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8629,6 +9092,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8645,6 +9109,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8661,6 +9126,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8677,6 +9143,7 @@ define 
amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8693,6 +9160,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8758,9 +9226,10 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8789,7 +9258,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, 
s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8807,6 +9278,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -8823,6 +9295,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8847,9 +9320,10 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8864,6 +9338,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, 
v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8880,6 +9355,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -8896,6 +9372,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8912,6 +9389,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -8928,6 +9406,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def 
$vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8944,6 +9423,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9009,9 +9489,10 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9040,7 +9521,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9058,6 +9541,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -9074,6 +9558,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9098,9 +9583,10 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9115,6 +9601,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, 
v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9131,6 +9618,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9147,6 +9635,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9163,6 +9652,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9179,6 +9669,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9195,6 +9686,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9260,9 +9752,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9291,7 +9784,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9309,6 +9804,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -9325,6 +9821,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9349,9 +9846,10 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9366,6 +9864,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ 
-9382,6 +9881,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9398,6 +9898,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9414,6 +9915,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9430,6 +9932,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9446,6 +9949,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9511,9 +10015,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9542,7 +10047,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9560,6 +10067,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], 
s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -9576,6 +10084,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9600,9 +10109,10 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9617,6 +10127,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9633,6 +10144,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -9649,6 +10161,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9665,6 +10178,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -9681,6 +10195,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9697,6 +10212,7 @@ define amdgpu_kernel void 
@global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10147,6 +10663,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10315,6 +10832,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -10330,7 +10848,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10379,6 +10899,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 @@ -10806,6 +11327,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10820,6 +11342,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10830,6 +11353,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -10840,6 +11364,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -10858,6 +11383,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10868,6 +11394,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10878,6 +11405,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10888,6 +11416,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10898,6 +11427,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10908,6 +11438,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -10918,6 +11449,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -10962,6 +11494,7 @@ define 
amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10976,6 +11509,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -10986,6 +11520,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -10996,6 +11531,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -11014,6 +11550,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11024,6 +11561,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11034,6 +11572,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11044,6 +11583,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11054,6 +11594,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11064,6 +11605,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -11074,6 +11616,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -11272,6 +11815,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11286,6 +11830,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11296,6 +11841,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11306,6 +11852,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11323,6 +11870,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11333,6 +11881,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11343,6 +11892,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11353,6 +11903,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11363,6 +11914,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11373,6 +11925,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11383,6 +11936,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -11425,6 +11979,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11439,6 +11994,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11449,6 +12005,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-WGP-NEXT: s_endpgm ; @@ -11459,6 +12016,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -11476,6 +12034,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11486,6 +12045,7 @@ define 
amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11496,6 +12056,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11506,6 +12067,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11516,6 +12078,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11526,6 +12089,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_endpgm ; @@ -11536,6 +12100,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -11579,7 +12144,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11593,7 +12160,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11603,7 +12172,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11613,7 +12184,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11630,7 +12203,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11640,7 +12215,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11650,7 +12227,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11660,7 +12239,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11670,7 +12251,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11680,7 +12263,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11690,7 +12275,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -11733,7 +12320,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11747,7 +12336,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11757,7 +12348,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11767,7 +12360,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11784,7 +12379,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11794,7 +12391,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11804,7 +12403,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11814,7 +12415,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11824,7 +12427,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11834,7 +12439,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11844,7 +12451,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -11904,6 +12513,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12070,6 +12680,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 @@ -12086,7 +12697,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12100,6 +12713,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -12112,6 +12726,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -12131,6 +12746,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -12143,6 +12759,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12155,6 +12772,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12167,6 +12785,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12179,6 +12798,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12191,6 +12811,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12203,6 +12824,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12253,6 +12875,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12269,7 +12892,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12283,6 +12908,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -12295,6 +12921,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -12314,6 +12941,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -12326,6 +12954,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12338,6 +12967,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -12350,6 +12980,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12362,6 +12993,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -12374,6 +13006,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12386,6 +13019,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12661,6 +13295,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12689,6 +13324,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12703,6 +13339,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12717,6 +13354,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12739,6 +13377,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12753,6 +13392,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, 
v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12767,6 +13407,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12781,6 +13422,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12795,6 +13437,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12809,6 +13452,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12823,6 +13467,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -12879,6 +13524,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -12907,6 +13553,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -12921,6 +13568,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -12935,6 +13583,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -12957,6 +13606,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12971,6 +13621,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12985,6 +13636,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -12999,6 +13651,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13013,6 +13666,7 
@@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13027,6 +13681,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13041,6 +13696,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13098,7 +13754,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13126,7 +13784,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13140,7 +13800,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13154,7 +13816,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13176,7 +13840,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13190,7 +13856,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13204,7 +13872,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13218,7 +13888,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13232,7 +13904,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13246,7 +13920,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13260,7 +13936,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -13317,7 +13995,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13345,7 +14025,9 @@ 
define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13359,7 +14041,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13373,7 +14057,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13395,7 +14081,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13409,7 +14097,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13423,7 +14113,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13437,7 +14129,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13451,7 +14145,9 @@ define amdgpu_kernel 
void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13465,7 +14161,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13479,7 +14177,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -13537,6 +14237,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13565,6 +14266,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13579,6 +14281,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13593,6 +14296,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13615,6 +14319,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13629,6 +14334,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13643,6 +14349,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13657,6 +14364,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13671,6 +14379,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13685,6 +14394,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13699,6 +14409,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -13756,6 +14467,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13784,6 +14496,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13798,6 +14511,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13812,6 +14526,7 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13834,6 +14549,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13848,6 +14564,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13862,6 +14579,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13876,6 +14594,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13890,6 +14609,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13904,6 +14624,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13918,6 +14639,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -13974,7 +14696,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14002,7 +14726,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14016,7 +14742,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14030,7 +14758,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14052,7 +14782,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 
killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14066,7 +14798,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14080,7 +14814,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14094,7 +14830,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] 
offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14108,7 +14846,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14122,7 +14862,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14136,7 +14878,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -14193,7 +14937,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14221,7 +14967,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14235,7 +14983,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14249,7 +14999,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14271,7 +15023,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14285,7 +15039,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14299,7 +15055,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14313,7 +15071,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14327,7 +15087,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14341,7 +15103,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14355,7 +15119,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: 
global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -14412,7 +15178,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14440,7 +15208,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14454,7 +15224,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14468,7 +15240,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14490,7 +15264,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14504,7 +15280,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14518,7 +15296,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14532,7 +15312,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14546,7 +15328,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14560,7 +15344,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14574,7 +15360,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -14631,7 +15419,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14659,7 +15449,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14673,7 +15465,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14687,7 +15481,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14709,7 +15505,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14723,7 +15521,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14737,7 +15537,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14751,7 +15553,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14765,7 +15569,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14779,7 +15585,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14793,7 +15601,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: 
v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -14850,7 +15660,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14878,7 +15690,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14892,7 +15706,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14906,7 +15722,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14928,7 +15746,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14942,7 +15762,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14956,7 +15778,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14970,7 +15794,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14984,7 +15810,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -14998,7 +15826,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15012,7 +15842,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -15069,7 +15901,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15097,7 +15931,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15111,7 +15947,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15125,7 +15963,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15147,7 +15987,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15161,7 +16003,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15175,7 +16019,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15189,7 +16035,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15203,7 +16051,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15217,7 +16067,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 
v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15231,7 +16083,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -15288,7 +16142,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15316,7 +16172,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15330,7 +16188,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15344,7 +16204,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15366,7 +16228,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15380,7 +16244,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ 
-15394,7 +16260,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15408,7 +16276,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15422,7 +16292,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15436,7 +16308,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15450,7 +16324,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -15507,7 +16383,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15535,7 +16413,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15549,7 +16429,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; 
kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15563,7 +16445,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15585,7 +16469,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15599,7 +16485,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15613,7 +16501,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15627,7 +16517,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15641,7 +16533,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15655,7 +16549,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15669,7 +16565,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -15978,8 +16876,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -16009,6 +16907,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -16067,8 
+16966,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -16228,6 +17127,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -16259,6 +17159,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -16277,6 +17178,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -16293,6 
+17195,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -16317,6 +17220,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -16334,6 +17238,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16350,6 +17255,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 
glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16366,6 +17272,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16382,6 +17289,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16398,6 +17306,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16414,6 +17323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16479,9 +17389,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -16510,7 +17421,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -16528,6 +17441,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -16544,6 +17458,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -16568,9 +17483,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -16585,6 +17501,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16601,6 +17518,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16617,6 +17535,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16633,6 +17552,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16649,6 +17569,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16665,6 +17586,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16730,9 +17652,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -16761,7 +17684,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -16779,6 +17704,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -16795,6 +17721,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -16819,9 +17746,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -16836,6 +17764,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16852,6 +17781,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -16868,6 +17798,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16884,6 +17815,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -16900,6 +17832,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16916,6 +17849,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -16982,8 +17916,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17013,6 +17947,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17071,8 +18006,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17233,8 +18168,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, 
s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17264,6 +18199,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17322,8 +18258,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17483,9 +18419,10 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17514,7 +18451,9 @@ define amdgpu_kernel 
void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17532,6 +18471,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -17548,6 +18488,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -17572,9 +18513,10 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17589,6 +18531,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17605,6 +18548,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17621,6 +18565,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17637,6 +18582,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17653,6 +18599,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17669,6 +18616,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17734,9 +18682,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17765,7 +18714,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17783,6 +18734,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -17799,6 +18751,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -17823,9 +18776,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17840,6 +18794,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17856,6 +18811,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17872,6 +18828,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17888,6 +18845,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17904,6 +18862,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17920,6 +18879,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17985,9 +18945,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18016,7 +18977,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18034,6 +18997,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -18050,6 +19014,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18074,9 +19039,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18091,6 +19057,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18107,6 +19074,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18123,6 +19091,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18139,6 +19108,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18155,6 +19125,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18171,6 +19142,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18236,9 +19208,10 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18267,7 +19240,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18285,6 +19260,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -18301,6 +19277,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18325,9 +19302,10 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def 
$vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18342,6 +19320,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18358,6 +19337,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18374,6 +19354,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18390,6 +19371,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18406,6 +19388,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18422,6 +19405,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18487,9 +19471,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18518,7 +19503,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18536,6 +19523,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -18552,6 +19540,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18576,9 +19565,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18593,6 +19583,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18609,6 +19600,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18625,6 +19617,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18641,6 +19634,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18657,6 +19651,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18673,6 +19668,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18738,9 +19734,10 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18769,7 +19766,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18787,6 +19786,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -18803,6 +19803,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18827,9 +19828,10 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18844,6 +19846,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18860,6 +19863,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18876,6 +19880,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18892,6 +19897,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18908,6 +19914,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18924,6 +19931,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18989,9 +19997,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19020,7 +20029,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19038,6 +20049,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -19054,6 +20066,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19078,9 +20091,10 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19095,6 +20109,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19111,6 +20126,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19127,6 +20143,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19143,6 +20160,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19159,6 +20177,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19175,6 +20194,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19240,9 +20260,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19271,7 +20292,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19289,6 +20312,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -19305,6 +20329,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19329,9 +20354,10 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19346,6 +20372,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19362,6 +20389,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19378,6 +20406,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19394,6 +20423,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19410,6 +20440,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19426,6 +20457,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 69b0c7f93ab0e..4aba5ac566426 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -418,6 +418,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -609,6 +610,7 @@ define amdgpu_kernel void 
@global_workgroup_seq_cst_load( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1601,6 +1603,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_atomicrmw: @@ -1615,6 +1618,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acquire_atomicrmw: @@ -1625,6 +1629,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -1637,6 +1642,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_atomicrmw: @@ -1654,6 +1660,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: @@ -1664,6 +1671,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: @@ -1686,6 +1694,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: @@ -1708,6 +1717,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -1720,6 +1730,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acquire_atomicrmw: @@ -1938,6 +1949,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -1953,6 
+1965,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -1965,6 +1978,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -1978,6 +1992,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -1996,6 +2011,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2007,6 +2023,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2031,6 +2048,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v1, s2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2056,6 +2074,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -2069,6 +2088,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2120,6 +2140,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2135,6 +2156,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2147,6 +2169,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -2160,6 +2183,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2178,6 +2202,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2189,6 +2214,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2213,6 +2239,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2238,6 +2265,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -2251,6 +2279,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2317,6 +2346,7 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2507,6 +2537,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2715,6 +2746,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3129,6 +3161,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: @@ -3157,6 +3190,7 @@ define amdgpu_kernel void 
@global_workgroup_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: @@ -3171,6 +3205,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3187,6 +3222,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_cmpxchg: @@ -3209,6 +3245,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: @@ -3223,6 +3260,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: @@ -3253,6 +3291,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: @@ -3283,6 +3322,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -3299,6 +3339,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: @@ -3596,6 +3637,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3625,6 +3667,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 
v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3641,6 +3684,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3658,6 +3702,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3681,6 +3726,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3696,6 +3742,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3728,6 +3775,7 @@ define amdgpu_kernel void 
@global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3761,6 +3809,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -3778,6 +3827,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3843,6 +3893,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3872,6 +3923,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3888,6 +3940,7 @@ define amdgpu_kernel void 
@global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3905,6 +3958,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3928,6 +3982,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3943,6 +3998,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3975,6 +4031,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm 
; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4008,6 +4065,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -4025,6 +4083,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4089,6 +4148,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg: @@ -4117,6 +4177,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: @@ -4131,6 +4192,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4147,6 +4209,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_cmpxchg: @@ -4169,6 +4232,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: @@ -4183,6 +4247,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: @@ -4213,6 +4278,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: @@ -4243,6 +4309,7 @@ define 
amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -4259,6 +4326,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: @@ -4318,6 +4386,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg: @@ -4346,6 +4415,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: @@ -4360,6 +4430,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4376,6 +4447,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_cmpxchg: @@ -4398,6 +4470,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: @@ -4412,6 +4485,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: @@ -4442,6 +4516,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: @@ -4472,6 +4547,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( 
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -4488,6 +4564,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: @@ -4548,6 +4625,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4577,6 +4655,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4593,6 +4672,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4610,6 +4690,7 @@ define amdgpu_kernel void 
@global_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4633,6 +4714,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4648,6 +4730,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4680,6 +4763,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4713,6 +4797,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -4730,6 +4815,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4795,6 +4881,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4824,6 +4911,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4840,6 +4928,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4857,6 +4946,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4880,6 +4970,7 @@ define 
amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4895,6 +4986,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4927,6 +5019,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4960,6 +5053,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -4977,6 +5071,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -5042,6 +5137,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5071,6 +5167,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5087,6 +5184,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -5104,6 +5202,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5127,6 +5226,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_workgroup_seq_cst_acquire_cmpxchg: @@ -5142,6 +5242,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5174,6 +5275,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5207,6 +5309,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -5224,6 +5327,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5289,6 +5393,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: @@ -5318,6 +5423,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: @@ -5334,6 +5440,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -5351,6 +5458,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: @@ -5374,6 +5482,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: @@ -5389,6 +5498,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: @@ -5421,6 +5531,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: @@ -5454,6 +5565,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -5471,6 +5583,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: @@ -5536,6 +5649,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: @@ -5565,6 +5679,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: @@ -5581,6 +5696,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -5598,6 +5714,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: @@ -5621,6 +5738,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: @@ -5636,6 +5754,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: @@ -5668,6 +5787,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, 
v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: @@ -5701,6 +5821,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -5718,6 +5839,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: @@ -5783,6 +5905,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5812,6 +5935,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5828,6 +5952,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -5845,6 +5970,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5868,6 +5994,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5883,6 +6010,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5915,6 +6043,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5948,6 +6077,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -5965,6 +6095,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -6030,6 +6161,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6059,6 +6191,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6075,6 +6208,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -6092,6 +6226,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6115,6 +6250,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6130,6 +6266,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6162,6 +6299,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6195,6 +6333,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -6212,6 +6351,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6277,6 +6417,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6306,6 +6447,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6322,6 +6464,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -6339,6 +6482,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6362,6 +6506,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6377,6 +6522,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6409,6 +6555,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6442,6 +6589,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -6459,6 +6607,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6774,8 +6923,8 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: ; kill: 
def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -6805,6 +6954,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6864,8 +7014,8 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7300,8 +7450,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7332,6 +7482,7 @@ define amdgpu_kernel void 
@global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7395,8 +7546,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7576,8 +7727,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7608,6 +7759,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7671,8 +7823,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7851,8 +8003,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7882,6 +8034,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7941,8 +8094,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8109,8 +8262,8 @@ define amdgpu_kernel void 
@global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8140,6 +8293,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8199,8 +8353,8 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8366,8 +8520,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ 
-8398,6 +8552,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8461,8 +8616,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8642,8 +8797,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8674,6 +8829,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8737,8 +8893,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8918,8 +9074,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8950,6 +9106,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9013,8 +9170,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9194,8 +9351,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: 
v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9226,6 +9383,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9289,8 +9447,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9470,8 +9628,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9502,6 +9660,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9565,8 +9724,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9744,8 +9903,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9776,6 +9935,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9839,8 +9999,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10020,8 +10180,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10052,6 +10212,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10115,8 +10276,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10296,8 +10457,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: 
def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10328,6 +10489,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10391,8 +10553,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10955,6 +11117,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11128,6 +11291,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -11143,7 +11307,9 @@ define amdgpu_kernel void 
@global_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11194,6 +11360,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -11632,6 +11799,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11646,6 +11814,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11668,6 +11837,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -11686,6 +11856,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11696,6 +11867,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11717,6 +11889,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11750,6 +11923,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -11798,6 +11972,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11812,6 +11987,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11834,6 +12010,7 @@ define amdgpu_kernel void 
@global_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -11852,6 +12029,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11862,6 +12040,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11883,6 +12062,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11916,6 +12096,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -12118,6 +12299,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_atomic_swap v0, 
off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: @@ -12132,6 +12314,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: @@ -12142,6 +12325,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12154,6 +12338,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw: @@ -12171,6 +12356,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: @@ -12181,6 +12367,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: @@ -12203,6 +12390,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: @@ -12225,6 +12413,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12237,6 +12426,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: @@ -12281,6 +12471,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -12295,6 +12486,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12317,6 +12509,7 @@ define amdgpu_kernel void 
@global_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -12334,6 +12527,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12344,6 +12538,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12365,6 +12560,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12398,6 +12594,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -12445,7 +12642,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: 
s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12459,7 +12658,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12472,6 +12673,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12483,7 +12685,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12500,7 +12704,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12510,7 +12716,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12533,7 +12741,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12559,6 +12769,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12570,7 +12781,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12619,7 +12832,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12633,7 +12848,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12646,6 +12863,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12657,7 +12875,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12674,7 +12894,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12684,7 +12906,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12707,7 +12931,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12733,6 +12959,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12744,7 +12971,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 
0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12810,6 +13039,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12981,6 +13211,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12997,7 +13228,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13026,6 +13259,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ 
-13045,6 +13279,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -13057,6 +13292,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -13083,6 +13319,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -13124,6 +13361,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13181,6 +13419,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: 
s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -13197,7 +13436,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13226,6 +13467,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -13245,6 +13487,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -13257,6 +13500,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -13283,6 +13527,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -13324,6 +13569,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13606,6 +13852,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13634,6 +13881,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13648,6 +13896,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -13664,6 +13913,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13686,6 +13936,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13700,6 +13951,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13730,6 +13982,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] 
offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13760,6 +14013,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -13776,6 +14030,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -13834,6 +14089,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13862,6 +14118,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13892,6 +14149,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def 
$vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -13914,6 +14172,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13928,6 +14187,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13957,6 +14217,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14002,6 +14263,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, 
v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -14063,7 +14325,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14091,7 +14355,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14108,6 +14374,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14123,7 +14390,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ 
-14145,7 +14414,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14159,7 +14430,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14190,7 +14463,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14224,6 +14499,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14239,7 +14515,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14302,7 +14580,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14330,7 +14610,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14347,6 +14629,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], 
s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14362,7 +14645,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14384,7 +14669,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14398,7 +14685,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14429,7 +14718,9 @@ define amdgpu_kernel void 
@global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14463,6 +14754,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14478,7 +14770,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14542,6 +14836,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14570,6 +14865,7 @@ define 
amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14584,6 +14880,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14600,6 +14897,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14622,6 +14920,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14636,6 +14935,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14666,6 +14966,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14696,6 +14997,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14712,6 +15014,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -14771,6 +15074,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; 
GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14799,6 +15103,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14813,6 +15118,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14829,6 +15135,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14851,6 +15158,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14865,6 +15173,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14895,6 +15204,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14925,6 +15235,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14941,6 +15252,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -14999,7 +15311,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) 
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15027,7 +15341,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15044,6 +15360,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15059,7 +15376,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15081,7 +15400,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15095,7 +15416,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15126,7 +15449,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15160,6 +15485,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15175,7 +15501,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 
v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15238,7 +15566,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15266,7 +15596,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15283,6 +15615,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15298,7 +15631,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 
killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15320,7 +15655,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15334,7 +15671,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15365,7 +15704,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15399,6 +15740,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15414,7 +15756,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15477,7 +15821,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15505,7 +15851,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15522,6 +15870,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15537,7 +15886,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15559,7 +15910,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15573,7 +15926,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15604,7 +15959,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15638,6 +15995,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15653,7 +16011,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15716,7 +16076,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed 
$vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15744,7 +16106,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15761,6 +16125,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15776,7 +16141,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15798,7 +16165,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15812,7 +16181,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15843,7 +16214,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15877,6 +16250,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: 
s_endpgm @@ -15892,7 +16266,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -15955,7 +16331,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -15983,7 +16361,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16000,6 +16380,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16015,7 +16396,9 @@ define amdgpu_kernel void 
@global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16037,7 +16420,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16051,7 +16436,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16082,7 +16469,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16116,6 +16505,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16131,7 +16521,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -16194,7 +16586,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16222,7 +16616,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 
+; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16239,6 +16635,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16254,7 +16651,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16276,7 +16675,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16290,7 +16691,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16321,7 +16724,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16355,6 +16760,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16370,7 +16776,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16433,7 
+16841,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16461,7 +16871,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16478,6 +16890,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16493,7 +16906,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16515,7 +16930,9 @@ define amdgpu_kernel void 
@global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16529,7 +16946,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16560,7 +16979,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16594,6 +17015,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], 
s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16609,7 +17031,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16672,7 +17096,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16700,7 +17126,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16717,6 +17145,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16732,7 +17161,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16754,7 +17185,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16768,7 +17201,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16799,7 +17234,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16833,6 +17270,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16848,7 +17286,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -17163,8 +17603,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; 
@@ -17194,6 +17634,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17253,8 +17694,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17418,6 +17859,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -17449,6 +17891,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17485,6 +17928,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg 
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -17509,6 +17953,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -17526,6 +17971,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17559,6 +18005,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
global_store_dword v0, v1, s[0:1] @@ -17610,6 +18057,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17679,9 +18127,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17710,7 +18159,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17747,6 +18198,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap 
v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -17771,9 +18223,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17788,6 +18241,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17822,6 +18276,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17875,6 +18330,7 @@ define 
amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17947,9 +18403,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17978,7 +18435,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18015,6 +18474,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18039,9 +18499,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18056,6 +18517,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18090,6 +18552,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18143,6 +18606,7 @@ define amdgpu_kernel void 
@global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18216,8 +18680,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18247,6 +18711,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18306,8 +18771,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: 
s_endpgm ; @@ -18474,8 +18939,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18505,6 +18970,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18564,8 +19030,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18729,9 +19195,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed 
$vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18760,7 +19227,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18797,6 +19266,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18821,9 +19291,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18838,6 +19309,7 @@ define amdgpu_kernel void 
@global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18872,6 +19344,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18925,6 +19398,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18997,9 +19471,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: 
s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19028,7 +19503,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19065,6 +19542,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19089,9 +19567,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19106,6 +19585,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19140,6 +19620,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19193,6 +19674,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19265,9 +19747,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed 
$vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19296,7 +19779,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19333,6 +19818,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19357,9 +19843,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19374,6 +19861,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19408,6 +19896,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19461,6 +19950,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19533,9 +20023,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword 
v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19564,7 +20055,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19601,6 +20094,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19625,9 +20119,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19642,6 +20137,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19676,6 +20172,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19729,6 +20226,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19801,9 +20299,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19832,7 
+20331,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19869,6 +20370,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19893,9 +20395,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19910,6 +20413,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19944,6 +20448,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19997,6 +20502,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20067,9 +20573,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -20098,7 +20605,9 @@ define amdgpu_kernel void 
@global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -20135,6 +20644,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20159,9 +20669,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20176,6 +20687,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -20210,6 +20722,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -20263,6 +20776,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20335,9 +20849,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -20366,7 +20881,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: 
v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -20403,6 +20920,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20427,9 +20945,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20444,6 +20963,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, 
v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -20478,6 +20998,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -20531,6 +21052,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20603,9 +21125,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -20634,7 +21157,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -20671,6 +21196,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20695,9 +21221,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20712,6 +21239,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -20746,6 +21274,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -20799,6 +21328,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 0467c5047a0be..97b7633c9d099 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -444,6 +444,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -469,6 +470,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -628,6 +630,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -655,6 +658,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,6 +1545,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1563,6 +1568,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1863,6 +1869,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1887,6 +1894,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2037,6 +2045,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -2061,6 +2070,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2218,6 +2228,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2245,6 +2256,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2414,6 +2426,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,6 +2456,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,6 +2634,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,6 +2664,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,6 +3000,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3010,6 +3027,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3360,6 +3378,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3388,6 +3407,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3561,6 +3581,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3589,6 +3610,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv 
sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3754,6 +3776,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3780,6 +3803,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3937,6 +3961,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3963,6 +3988,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4128,6 +4154,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4156,6 +4183,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4329,6 +4357,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4357,6 +4386,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4530,6 +4560,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4558,6 +4589,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4731,6 +4763,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4759,6 +4792,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4932,6 +4966,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4960,6 +4995,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5133,6 +5169,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5161,6 +5198,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5334,6 +5372,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5362,6 +5401,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5535,6 +5575,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5563,6 +5604,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5954,6 +5996,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5985,6 +6028,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6407,6 +6451,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6440,6 +6485,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6641,6 +6687,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6674,6 +6721,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6867,6 +6915,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6898,6 +6947,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7083,6 +7133,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7114,6 +7165,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7307,6 +7359,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,6 +7393,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,6 +7595,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7574,6 +7629,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7775,6 +7831,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7808,6 +7865,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8009,6 +8067,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8042,6 +8101,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8243,6 +8303,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8276,6 +8337,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8477,6 +8539,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8510,6 +8573,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8711,6 +8775,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8744,6 +8809,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8945,6 +9011,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8978,6 +9045,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9415,6 +9483,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9498,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9442,6 +9512,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9454,6 +9525,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9467,6 +9539,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9553,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9492,6 +9566,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9579,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9516,6 +9592,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9528,6 +9605,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9540,6 +9618,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -9585,7 +9664,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9680,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( 
; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9612,7 +9695,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9624,7 +9709,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9637,7 +9724,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9739,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9662,7 +9753,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9767,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9686,7 +9781,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9698,7 +9795,9 @@ 
define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9710,7 +9809,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10036,6 +10137,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10149,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10057,6 +10160,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10067,6 +10171,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10078,6 +10183,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10194,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10098,6 +10205,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10216,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10118,6 +10227,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10128,6 +10238,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10138,6 +10249,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10176,6 +10288,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10300,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10197,6 +10311,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10207,6 +10322,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10218,6 +10334,7 @@ define amdgpu_kernel void 
@local_agent_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10345,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10238,6 +10356,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10367,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10258,6 +10378,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10268,6 +10389,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10278,6 +10400,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10457,6 +10580,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10468,6 +10592,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10478,6 +10603,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10488,6 +10614,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10499,6 +10626,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10509,6 +10637,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10519,6 +10648,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10529,6 +10659,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10539,6 +10670,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10549,6 +10681,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; 
GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10559,6 +10692,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: @@ -10596,6 +10730,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10742,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10617,6 +10753,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10627,6 +10764,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10638,6 +10776,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10787,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10658,6 +10798,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10809,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10678,6 +10820,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10688,6 +10831,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10698,6 +10842,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10736,7 +10881,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10747,7 +10894,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10757,7 +10906,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10767,7 +10918,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10778,7 +10931,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10788,7 +10943,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10798,7 +10955,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10808,7 +10967,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10818,7 +10979,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10828,7 +10991,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10838,7 +11003,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10876,7 +11043,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; 
GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10887,7 +11056,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10897,7 +11068,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10907,7 +11080,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10918,7 +11093,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10928,7 +11105,9 
@@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10938,7 +11117,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10948,7 +11129,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10958,7 +11141,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
local_agent_one_as_seq_cst_atomicrmw: @@ -10968,7 +11153,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10978,7 +11165,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -11017,6 +11206,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11222,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11046,6 +11237,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11059,6 +11251,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11073,6 +11266,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11281,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11100,6 +11295,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11309,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11126,6 +11323,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11139,6 +11337,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11152,6 +11351,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11199,7 +11399,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11416,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11228,7 +11432,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11241,7 +11447,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11255,7 +11463,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11479,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11282,7 +11494,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11509,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11308,7 +11524,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11321,7 +11539,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11334,7 +11554,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11382,7 +11604,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11621,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11411,7 +11637,9 @@ define 
amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11424,7 +11652,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11438,7 +11668,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11684,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11465,7 +11699,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11714,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11491,7 +11729,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11504,7 +11744,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11517,7 +11759,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11735,6 +11979,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11993,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11760,6 +12006,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11772,6 +12019,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, 
v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +12033,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +12046,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11809,6 +12059,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +12072,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11833,6 +12085,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11845,6 +12098,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11857,6 +12111,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +12156,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12170,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11926,6 +12183,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11938,6 +12196,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -11951,6 +12210,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12223,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11975,6 +12236,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12249,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11999,6 +12262,7 @@ define 
amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -12011,6 +12275,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -12023,6 +12288,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -12068,7 +12334,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12349,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12093,7 +12363,9 @@ define 
amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12105,7 +12377,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12392,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12406,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: 
@@ -12142,7 +12420,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12434,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12166,7 +12448,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12178,7 +12462,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12190,7 +12476,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12523,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12538,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12260,7 +12552,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12272,7 +12566,9 @@ define amdgpu_kernel void 
@local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12581,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12595,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12309,7 +12609,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12623,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12333,7 +12637,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12345,7 +12651,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12357,7 +12665,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12713,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12727,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12428,6 +12740,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12440,6 +12753,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12767,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12780,7 @@ define 
amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12477,6 +12793,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12806,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12501,6 +12819,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12513,6 +12832,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12525,6 +12845,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12891,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12905,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12595,6 +12918,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12607,6 +12931,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12945,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12958,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12644,6 +12971,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12984,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12668,6 +12997,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12680,6 +13010,7 @@ define amdgpu_kernel 
void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12692,6 +13023,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +13068,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12749,7 +13083,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12761,7 +13097,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12773,7 +13111,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12786,7 +13126,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12798,7 +13140,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12810,7 +13154,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12822,7 +13168,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12834,7 +13182,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12846,7 +13196,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12858,7 +13210,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12903,7 +13257,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13272,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12928,7 +13286,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12940,7 +13300,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 
+13315,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13329,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12977,7 +13343,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13357,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13001,7 +13371,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13013,7 +13385,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13025,7 +13399,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13446,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13461,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13095,7 +13475,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13107,7 +13489,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13504,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13518,9 @@ define 
amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13144,7 +13532,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13546,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13168,7 +13560,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13180,7 +13574,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13192,7 +13588,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13635,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13650,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13262,7 +13664,9 @@ define amdgpu_kernel void 
@local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13274,7 +13678,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13693,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13707,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13311,7 +13721,9 
@@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13735,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13335,7 +13749,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13347,7 +13763,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13359,7 +13777,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13824,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13839,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13429,7 +13853,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13441,7 +13867,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13454,7 +13882,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13896,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13478,7 +13910,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13924,9 @@ define 
amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13502,7 +13938,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13514,7 +13952,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13526,7 +13966,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +14013,9 @@ 
define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +14028,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13596,7 +14042,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13608,7 +14056,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +14071,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +14085,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13645,7 +14099,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +14113,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13669,7 +14127,9 @@ define amdgpu_kernel void 
@local_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13681,7 +14141,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13693,7 +14155,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +14202,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +14217,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13763,7 +14231,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13775,7 +14245,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +14260,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +14274,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13812,7 +14288,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14302,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13836,7 +14316,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13848,7 +14330,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13860,7 +14344,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14391,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14406,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13930,7 +14420,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13942,7 +14434,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14449,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14463,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13979,7 +14477,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14491,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14003,7 +14505,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14015,7 +14519,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14027,7 +14533,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14792,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14810,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14317,6 +14827,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14332,6 +14843,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14348,6 +14860,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14877,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14379,6 +14893,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14909,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14409,6 +14925,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14424,6 +14941,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14439,6 +14957,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14494,6 +15013,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +15031,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14527,6 +15048,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14542,6 +15064,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14558,6 +15081,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +15098,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14589,6 +15114,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 
+15130,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14619,6 +15146,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14634,6 +15162,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14649,6 +15178,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +15235,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +15254,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14738,7 +15272,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14753,7 +15289,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14769,7 +15307,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15325,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14800,7 +15342,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15359,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
ds_write_b32 v0, v1 @@ -14830,7 +15376,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14845,7 +15393,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14860,7 +15410,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14916,7 +15468,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15487,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14949,7 +15505,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14964,7 +15522,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14980,7 +15540,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15558,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15011,7 +15575,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15592,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15041,7 +15609,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15056,7 +15626,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15071,7 +15643,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15128,6 +15702,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15720,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 
; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15161,6 +15737,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15176,6 +15753,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15192,6 +15770,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15787,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15223,6 
+15803,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15819,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15253,6 +15835,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15268,6 +15851,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15283,6 +15867,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, 
v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15339,6 +15924,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15942,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15372,6 +15959,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15387,6 +15975,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15403,6 +15992,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, 
v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +16009,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15434,6 +16025,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +16041,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15464,6 +16057,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15479,6 +16073,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15494,6 +16089,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15549,7 +16145,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +16164,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15582,7 +16182,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15597,7 +16199,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15613,7 +16217,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +16235,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15644,7 
+16252,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +16269,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15674,7 +16286,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15689,7 +16303,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15704,7 +16320,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15760,7 +16378,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16397,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15793,7 +16415,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15808,7 +16432,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15824,7 +16450,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16468,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15855,7 +16485,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16502,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15885,7 +16519,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15900,7 +16536,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15915,7 +16553,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15971,7 +16611,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16630,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16004,7 +16648,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: 
ds_write_b32 v0, v1 @@ -16019,7 +16665,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16035,7 +16683,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16701,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16066,7 +16718,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16735,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16096,7 +16752,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16111,7 +16769,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16126,7 +16786,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( 
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16182,7 +16844,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16863,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16215,7 +16881,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16230,7 +16898,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16246,7 +16916,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16934,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16277,7 +16951,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16968,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16307,7 +16985,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16322,7 +17002,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16337,7 +17019,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16393,7 +17077,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +17096,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16426,7 +17114,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16441,7 +17131,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16457,7 +17149,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +17167,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16488,7 +17184,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +17201,9 @@ define amdgpu_kernel void 
@local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16518,7 +17218,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16533,7 +17235,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16548,7 +17252,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16604,7 +17310,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +17329,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16637,7 +17347,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16652,7 +17364,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 
s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16668,7 +17382,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +17400,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16699,7 +17417,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17434,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16729,7 +17451,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16744,7 +17468,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16759,7 +17485,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16815,7 +17543,9 @@ define amdgpu_kernel 
void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17562,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16848,7 +17580,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16863,7 +17597,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16879,7 +17615,9 @@ define amdgpu_kernel void 
@local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17633,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16910,7 +17650,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17667,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16940,7 +17684,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16955,7 +17701,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16970,7 +17718,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -17026,7 +17776,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, 
s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17795,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17059,7 +17813,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -17074,7 +17830,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -17090,7 +17848,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17866,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17121,7 +17883,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17900,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17151,7 +17917,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17166,7 +17934,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -17181,7 +17951,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 78209ee34cad4..b6097541c261a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ 
-845,6 +845,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index f84d451f8ecb0..11df60a1a051a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -366,6 +366,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +381,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -393,6 +395,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -405,6 +408,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -418,6 +422,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -431,6 +436,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -443,6 +449,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -455,6 +462,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -467,6 +475,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -479,6 +488,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -491,6 +501,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -536,7 +547,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -550,7 +563,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -563,7 +578,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -575,7 +592,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -588,7 +607,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -601,7 +622,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -613,7 +636,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -625,7 +650,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -637,7 +664,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -649,7 +678,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -661,7 +692,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -987,6 +1020,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -998,6 +1032,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1008,6 +1043,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1018,6 +1054,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1029,6 +1066,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1039,6 +1077,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1049,6 +1088,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1059,6 +1099,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1069,6 +1110,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1079,6 +1121,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1089,6 +1132,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 
v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1127,6 +1171,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1138,6 +1183,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1148,6 +1194,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1158,6 +1205,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1169,6 +1217,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1179,6 +1228,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1189,6 +1239,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1250,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1209,6 +1261,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1219,6 +1272,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1229,6 +1283,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1408,6 +1463,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_atomicrmw: @@ -1419,6 +1475,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_atomicrmw: @@ -1429,6 +1486,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acquire_atomicrmw: @@ -1439,6 +1497,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_atomicrmw: @@ -1450,6 +1509,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1460,6 +1520,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1470,6 +1531,7 @@ define amdgpu_kernel 
void @local_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1480,6 +1542,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: @@ -1490,6 +1553,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acquire_atomicrmw: @@ -1500,6 +1564,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_atomicrmw: @@ -1510,6 +1575,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_atomicrmw: @@ -1547,6 +1613,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: 
v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1558,6 +1625,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1568,6 +1636,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1578,6 +1647,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1589,6 +1659,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1599,6 +1670,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1609,6 +1681,7 @@ define amdgpu_kernel void 
@local_singlethread_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1619,6 +1692,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1629,6 +1703,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1639,6 +1714,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1649,6 +1725,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1687,7 +1764,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1698,7 +1777,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1708,7 +1789,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1718,7 +1801,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1729,7 +1814,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
local_singlethread_acq_rel_atomicrmw: @@ -1739,7 +1826,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1749,7 +1838,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1759,7 +1850,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1769,7 +1862,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1779,7 +1874,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1789,7 +1886,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: @@ -1827,7 +1926,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1838,7 +1939,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1848,7 +1951,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1858,7 +1963,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1869,7 +1976,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1879,7 +1988,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1889,7 +2000,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1899,7 +2012,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1909,7 +2024,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1919,7 +2036,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1929,7 +2048,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: @@ -1968,6 +2089,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1983,6 +2105,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1997,6 +2120,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -2010,6 +2134,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -2024,6 +2149,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2038,6 +2164,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2051,6 +2178,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2064,6 +2192,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2077,6 +2206,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2090,6 +2220,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -2103,6 +2234,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2150,7 +2282,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2165,7 +2299,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2179,7 +2315,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -2192,7 +2330,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -2206,7 +2346,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,7 +2362,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2233,7 +2377,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2246,7 +2392,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2259,7 +2407,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2272,7 +2422,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -2285,7 +2437,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2333,7 +2487,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2348,7 +2504,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2362,7 +2520,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -2375,7 +2535,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -2389,7 +2551,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2403,7 +2567,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2416,7 +2582,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2429,7 +2597,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2442,7 +2612,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2455,7 +2627,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -2468,7 +2642,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2686,6 +2862,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; 
GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2699,6 +2876,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2711,6 +2889,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2723,6 +2902,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2736,6 +2916,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2748,6 +2929,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2760,6 +2942,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2772,6 +2955,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2784,6 +2968,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2796,6 +2981,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2808,6 +2994,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 
v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: @@ -2852,6 +3039,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2865,6 +3053,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -2877,6 +3066,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2889,6 +3079,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -2902,6 +3093,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2914,6 +3106,7 @@ define amdgpu_kernel void 
@local_singlethread_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2926,6 +3119,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -2938,6 +3132,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2950,6 +3145,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2962,6 +3158,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -2974,6 +3171,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3019,7 +3217,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3032,7 +3232,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3044,7 +3246,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3056,7 +3260,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3069,7 +3275,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3081,7 +3289,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3093,7 +3303,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3105,7 +3317,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3117,7 +3331,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3129,7 +3345,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3141,7 +3359,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: @@ -3186,7 +3406,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: 
s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3199,7 +3421,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3211,7 +3435,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3223,7 +3449,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3236,7 +3464,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; 
GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3248,7 +3478,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3260,7 +3492,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3272,7 +3506,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3284,7 +3520,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3296,7 +3534,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3308,7 +3548,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: @@ -3354,6 +3596,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3367,6 +3610,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3379,6 +3623,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3391,6 +3636,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3404,6 +3650,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3416,6 +3663,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3428,6 +3676,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3440,6 +3689,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3452,6 +3702,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3464,6 +3715,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3476,6 +3728,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: @@ -3521,6 +3774,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3534,6 +3788,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3546,6 +3801,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3558,6 +3814,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3571,6 +3828,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3583,6 +3841,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3595,6 +3854,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3607,6 +3867,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3619,6 +3880,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3631,6 +3893,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3643,6 +3906,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: @@ -3687,7 +3951,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3700,7 +3966,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3712,7 +3980,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3724,7 +3994,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3737,7 +4009,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3749,7 +4023,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3761,7 +4037,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3773,7 +4051,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3785,7 +4065,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3797,7 +4079,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3809,7 +4093,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: @@ -3854,7 +4140,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3867,7 +4155,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; 
GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3879,7 +4169,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3891,7 +4183,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3904,7 +4198,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3916,7 +4212,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3928,7 +4226,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3940,7 +4240,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3952,7 +4254,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3964,7 +4268,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -3976,7 +4282,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: @@ -4021,7 +4329,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4034,7 +4344,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4046,7 +4358,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4058,7 +4372,9 @@ define amdgpu_kernel void 
@local_singlethread_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4071,7 +4387,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4083,7 +4401,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4095,7 +4415,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
local_singlethread_seq_cst_acquire_cmpxchg: @@ -4107,7 +4429,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4119,7 +4443,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4131,7 +4457,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: @@ -4143,7 +4471,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: 
local_singlethread_seq_cst_acquire_cmpxchg: @@ -4188,7 +4518,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4201,7 +4533,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4213,7 +4547,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4225,7 +4561,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4238,7 +4576,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4250,7 +4590,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4262,7 +4604,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4274,7 +4618,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: 
local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4286,7 +4632,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4298,7 +4646,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4310,7 +4660,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: @@ -4355,7 +4707,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4368,7 +4722,9 @@ define amdgpu_kernel 
void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4380,7 +4736,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4392,7 +4750,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4405,7 +4765,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4417,7 +4779,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4429,7 +4793,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4441,7 +4807,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4453,7 +4821,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4465,7 +4835,9 @@ define 
amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4477,7 +4849,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: @@ -4522,7 +4896,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4535,7 +4911,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4547,7 +4925,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4559,7 +4939,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4572,7 +4954,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4584,7 +4968,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4596,7 +4982,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4608,7 +4996,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4620,7 +5010,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4632,7 +5024,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4644,7 +5038,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: @@ -4689,7 +5085,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4702,7 +5100,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4714,7 +5114,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4726,7 +5128,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4739,7 +5143,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4751,7 +5157,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4763,7 +5171,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4775,7 +5185,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4787,7 +5199,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4799,7 +5213,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4811,7 +5227,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: @@ -4856,7 +5274,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4869,7 +5289,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4881,7 +5303,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4893,7 +5317,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4906,7 +5332,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4918,7 +5346,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4930,7 +5360,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4942,7 +5374,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4954,7 +5388,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4966,7 +5402,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4978,7 +5416,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: @@ -5235,6 +5675,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5252,6 +5693,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5268,6 +5710,7 @@ define amdgpu_kernel void 
@local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -5283,6 +5726,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -5299,6 +5743,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5315,6 +5760,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5330,6 +5776,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5345,6 +5792,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5360,6 +5808,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5375,6 +5824,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -5390,6 +5840,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -5445,6 +5896,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; 
GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5462,6 +5914,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5478,6 +5931,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5493,6 +5947,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5509,6 +5964,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5525,6 +5981,7 @@ define amdgpu_kernel void 
@local_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5540,6 +5997,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5555,6 +6013,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5570,6 +6029,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5585,6 +6045,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,6 +6061,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5656,7 +6118,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5673,7 +6137,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5689,7 +6155,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 
s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -5704,7 +6172,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -5720,7 +6190,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5736,7 +6208,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5751,7 +6225,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5766,7 +6242,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5781,7 +6259,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5796,7 +6276,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -5811,7 +6293,9 @@ define amdgpu_kernel 
void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -5867,7 +6351,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5884,7 +6370,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5900,7 +6388,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -5915,7 +6405,9 @@ define amdgpu_kernel void 
@local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -5931,7 +6423,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5947,7 +6441,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5962,7 +6458,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5977,7 +6475,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5992,7 +6492,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6007,7 +6509,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6022,7 +6526,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6079,6 +6585,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6096,6 +6603,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6112,6 +6620,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6127,6 +6636,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6143,6 +6653,7 @@ define amdgpu_kernel void 
@local_singlethread_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6159,6 +6670,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6174,6 +6686,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6189,6 +6702,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6204,6 +6718,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6219,6 +6734,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6234,6 +6750,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6290,6 +6807,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,6 +6825,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6323,6 +6842,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; 
GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6338,6 +6858,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6354,6 +6875,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6370,6 +6892,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6385,6 +6908,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 
@@ -6400,6 +6924,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6415,6 +6940,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6430,6 +6956,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6445,6 +6972,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6500,7 +7028,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,7 +7047,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6533,7 +7065,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6548,7 +7082,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6564,7 +7100,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6580,7 +7118,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6595,7 +7135,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6610,7 +7152,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6625,7 
+7169,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6640,7 +7186,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6655,7 +7203,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6711,7 +7261,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6728,7 +7280,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6744,7 +7298,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6759,7 +7315,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6775,7 +7333,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6791,7 +7351,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6806,7 +7368,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6821,7 +7385,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6836,7 +7402,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6851,7 +7419,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6866,7 +7436,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6922,7 +7494,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6939,7 +7513,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 
; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6955,7 +7531,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6970,7 +7548,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6986,7 +7566,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7002,7 +7584,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7017,7 +7601,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7032,7 +7618,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7047,7 +7635,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7062,7 +7652,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7077,7 +7669,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7133,7 +7727,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7150,7 +7746,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7166,7 +7764,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -7181,7 +7781,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -7197,7 +7799,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7213,7 +7817,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7228,7 +7834,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7243,7 +7851,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7258,7 +7868,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ 
-7273,7 +7885,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7288,7 +7902,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7344,7 +7960,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7361,7 +7979,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7377,7 +7997,9 @@ define amdgpu_kernel void 
@local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -7392,7 +8014,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -7408,7 +8032,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7424,7 +8050,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7439,7 +8067,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7454,7 +8084,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7469,7 +8101,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7484,7 +8118,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7499,7 +8135,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7555,7 +8193,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7572,7 +8212,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7588,7 +8230,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -7603,7 +8247,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -7619,7 +8265,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7635,7 +8283,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7650,7 +8300,9 @@ define amdgpu_kernel void 
@local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7665,7 +8317,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7680,7 +8334,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7695,7 +8351,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7710,7 +8368,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7766,7 +8426,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7783,7 +8445,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7799,7 +8463,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -7814,7 +8480,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -7830,7 +8498,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7846,7 +8516,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7861,7 +8533,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7876,7 +8550,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7891,7 +8567,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7906,7 +8584,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7921,7 
+8601,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7977,7 +8659,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7994,7 +8678,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8010,7 +8696,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -8025,7 +8713,9 @@ define amdgpu_kernel void 
@local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -8041,7 +8731,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8057,7 +8749,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8072,7 +8766,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8087,7 +8783,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8102,7 +8800,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8117,7 +8817,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8132,7 +8834,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8529,6 +9233,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8543,6 +9248,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8556,6 +9262,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -8568,6 +9275,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -8581,6 +9289,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8594,6 +9303,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8606,6 +9316,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8618,6 +9329,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8630,6 +9342,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8642,6 +9355,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8654,6 +9368,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8699,7 +9414,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8713,7 +9430,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8726,7 +9445,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -8738,7 
+9459,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -8751,7 +9474,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8764,7 +9489,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8776,7 +9503,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8788,7 +9517,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8800,7 +9531,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8812,7 +9545,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8824,7 +9559,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -9150,6 +9887,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9161,6 +9899,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9171,6 +9910,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -9181,6 +9921,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -9192,6 +9933,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9202,6 +9944,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9212,6 +9955,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -9222,6 +9966,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9232,6 +9977,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -9242,6 +9988,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -9252,6 +9999,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -9290,6 +10038,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9301,6 +10050,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9311,6 +10061,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -9321,6 +10072,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -9332,6 +10084,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9342,6 +10095,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9352,6 +10106,7 
@@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -9362,6 +10117,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9372,6 +10128,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -9382,6 +10139,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -9392,6 +10150,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -9571,6 +10330,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9582,6 +10342,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9592,6 +10353,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9602,6 +10364,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9613,6 +10376,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9623,6 +10387,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
local_singlethread_one_as_acquire_atomicrmw: @@ -9633,6 +10398,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9643,6 +10409,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9653,6 +10420,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9663,6 +10431,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: @@ -9673,6 +10442,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: 
local_singlethread_one_as_acquire_atomicrmw: @@ -9710,6 +10480,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9721,6 +10492,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9731,6 +10503,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -9741,6 +10514,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -9752,6 +10526,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9762,6 +10537,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9772,6 +10548,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -9782,6 +10559,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9792,6 +10570,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -9802,6 +10581,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -9812,6 +10592,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: 
s_endpgm ; @@ -9850,7 +10631,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9861,7 +10644,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9871,7 +10656,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9881,7 +10668,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9892,7 +10681,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9902,7 +10693,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9912,7 +10705,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9922,7 +10717,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9932,7 +10729,9 @@ define amdgpu_kernel void 
@local_singlethread_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9942,7 +10741,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9952,7 +10753,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: @@ -9990,7 +10793,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10001,7 +10806,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 
v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10011,7 +10818,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10021,7 +10830,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10032,7 +10843,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10042,7 +10855,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10052,7 +10867,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10062,7 +10879,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10072,7 +10891,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10082,7 +10903,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10092,7 +10915,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: @@ -10131,6 +10956,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10146,6 +10972,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10160,6 +10987,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -10173,6 +11001,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -10187,6 +11016,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10201,6 +11031,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10214,6 +11045,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10227,6 +11059,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10240,6 +11073,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10253,6 +11087,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -10266,6 +11101,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10313,7 +11149,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10328,7 +11166,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 
; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10342,7 +11182,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -10355,7 +11197,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -10369,7 +11213,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10383,7 +11229,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10396,7 +11244,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10409,7 +11259,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10422,7 +11274,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10435,7 +11289,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -10448,7 +11304,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10496,7 +11354,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10511,7 +11371,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ 
-10525,7 +11387,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -10538,7 +11402,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -10552,7 +11418,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10566,7 +11434,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10579,7 +11449,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10592,7 +11464,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10605,7 +11479,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10618,7 +11494,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -10631,7 +11509,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10849,6 +11729,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10862,6 +11743,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10874,6 +11756,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10886,6 +11769,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10899,6 +11783,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10911,6 +11796,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10923,6 +11809,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10935,6 +11822,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10947,6 +11835,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10959,6 +11848,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -10971,6 +11861,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -11015,6 +11906,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11028,6 +11920,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11040,6 +11933,7 @@ define 
amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11052,6 +11946,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -11065,6 +11960,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11077,6 +11973,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11089,6 +11986,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11101,6 +11999,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11113,6 +12012,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11125,6 +12025,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -11137,6 +12038,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -11182,7 +12084,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11195,7 +12099,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: 
v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11207,7 +12113,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11219,7 +12127,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11232,7 +12142,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11244,7 +12156,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11256,7 +12170,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11268,7 +12184,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11280,7 +12198,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: 
s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11292,7 +12212,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11304,7 +12226,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -11349,7 +12273,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11362,7 +12288,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ 
-11374,7 +12302,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11386,7 +12316,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11399,7 +12331,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11411,7 +12345,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11423,7 +12359,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11435,7 +12373,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11447,7 +12387,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11459,7 +12401,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11471,7 +12415,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -11517,6 +12463,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11530,6 +12477,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11542,6 +12490,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11554,6 +12503,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11567,6 +12517,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11579,6 +12530,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11591,6 +12543,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11603,6 +12556,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11615,6 +12569,7 
@@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11627,6 +12582,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11639,6 +12595,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -11684,6 +12641,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11697,6 +12655,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11709,6 +12668,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11721,6 +12681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11734,6 +12695,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11746,6 +12708,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11758,6 +12721,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11770,6 +12734,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11782,6 +12747,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11794,6 +12760,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11806,6 +12773,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: @@ -11850,7 +12818,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11863,7 +12833,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11875,7 +12847,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11887,7 +12861,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11900,7 +12876,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11912,7 +12890,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11924,7 +12904,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11936,7 +12918,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11948,7 +12932,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11960,7 +12946,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -11972,7 +12960,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: @@ -12017,7 +13007,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12030,7 +13022,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12042,7 +13036,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12054,7 +13050,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12067,7 +13065,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12079,7 +13079,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12091,7 +13093,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12103,7 +13107,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12115,7 +13121,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12127,7 +13135,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12139,7 +13149,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -12184,7 +13196,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12197,7 +13211,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12209,7 +13225,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12221,7 +13239,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12234,7 +13254,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12246,7 +13268,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12258,7 +13282,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12270,7 +13296,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12282,7 +13310,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12294,7 +13324,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12306,7 +13338,9 @@ define 
amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -12351,7 +13385,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12364,7 +13400,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12376,7 +13414,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12388,7 +13428,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12401,7 +13443,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12413,7 +13457,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12425,7 +13471,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12437,7 +13485,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12449,7 +13499,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12461,7 +13513,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12473,7 +13527,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -12518,7 +13574,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12531,7 +13589,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12543,7 +13603,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12555,7 +13617,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12568,7 +13632,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12580,7 +13646,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12592,7 +13660,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12604,7 +13674,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12616,7 +13688,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12628,7 +13702,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12640,7 +13716,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -12685,7 +13763,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12698,7 +13778,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12710,7 +13792,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12722,7 +13806,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12735,7 +13821,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12747,7 +13835,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12759,7 +13849,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12771,7 +13863,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12783,7 +13877,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12795,7 +13891,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12807,7 +13905,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: @@ -12852,7 +13952,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12865,7 +13967,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12877,7 +13981,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12889,7 +13995,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12902,7 +14010,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12914,7 +14024,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12926,7 +14038,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12938,7 +14052,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12950,7 +14066,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12962,7 +14080,9 @@ define amdgpu_kernel void 
@local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -12974,7 +14094,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -13019,7 +14141,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13032,7 +14156,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13044,7 +14170,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13056,7 +14184,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13069,7 +14199,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13081,7 +14213,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13093,7 +14227,9 @@ define amdgpu_kernel void 
@local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13105,7 +14241,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13117,7 +14255,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13129,7 +14269,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13141,7 +14283,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -13398,6 +14542,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13415,6 +14560,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13431,6 +14577,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -13446,6 +14593,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -13462,6 +14610,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13478,6 +14627,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13493,6 +14643,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13508,6 +14659,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
ds_write_b32 v0, v1 @@ -13523,6 +14675,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13538,6 +14691,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -13553,6 +14707,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -13608,6 +14763,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13625,6 +14781,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, 
v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13641,6 +14798,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13656,6 +14814,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13672,6 +14831,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13688,6 +14848,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13703,6 +14864,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13718,6 +14880,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13733,6 +14896,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13748,6 +14912,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13763,6 +14928,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13819,7 +14985,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13836,7 +15004,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13852,7 +15022,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -13867,7 +15039,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -13883,7 +15057,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13899,7 +15075,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13914,7 +15092,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13929,7 +15109,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13944,7 +15126,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13959,7 +15143,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -13974,7 +15160,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ 
-14030,7 +15218,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14047,7 +15237,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14063,7 +15255,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14078,7 +15272,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14094,7 +15290,9 @@ 
define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14110,7 +15308,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14125,7 +15325,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14140,7 +15342,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14155,7 +15359,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14170,7 +15376,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14185,7 +15393,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14242,6 +15452,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; 
GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14259,6 +15470,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14275,6 +15487,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14290,6 +15503,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14306,6 +15520,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14322,6 +15537,7 @@ define amdgpu_kernel void 
@local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14337,6 +15553,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14352,6 +15569,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14367,6 +15585,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14382,6 +15601,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14397,6 +15617,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14453,6 +15674,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14470,6 +15692,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14486,6 +15709,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14501,6 +15725,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14517,6 +15742,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14533,6 +15759,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14548,6 +15775,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14563,6 +15791,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14578,6 +15807,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14593,6 +15823,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14608,6 +15839,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14663,7 +15895,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14680,7 +15914,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14696,7 +15932,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14711,7 +15949,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14727,7 +15967,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14743,7 +15985,9 @@ define amdgpu_kernel void 
@local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14758,7 +16002,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14773,7 +16019,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14788,7 +16036,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14803,7 +16053,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14818,7 +16070,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14874,7 +16128,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14891,7 +16147,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14907,7 +16165,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14922,7 +16182,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14938,7 +16200,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14954,7 +16218,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14969,7 +16235,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14984,7 +16252,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14999,7 +16269,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15014,7 +16286,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15029,7 +16303,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15085,7 +16361,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15102,7 +16380,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 
+; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15118,7 +16398,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15133,7 +16415,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15149,7 +16433,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15165,7 +16451,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15180,7 +16468,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15195,7 +16485,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15210,7 +16502,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15225,7 +16519,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15240,7 +16536,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15296,7 +16594,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15313,7 +16613,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15329,7 +16631,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15344,7 +16648,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15360,7 +16666,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15376,7 +16684,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15391,7 +16701,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15406,7 +16718,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15421,7 +16735,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15436,7 +16752,9 @@ 
define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15451,7 +16769,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15507,7 +16827,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15524,7 +16846,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15540,7 +16864,9 @@ define 
amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15555,7 +16881,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15571,7 +16899,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15587,7 +16917,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15602,7 +16934,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15617,7 +16951,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15632,7 +16968,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15647,7 +16985,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15662,7 +17002,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15718,7 +17060,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15735,7 +17079,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15751,7 +17097,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15766,7 +17114,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15782,7 +17132,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15798,7 +17150,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15813,7 +17167,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15828,7 +17184,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15843,7 +17201,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15858,7 +17218,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15873,7 +17235,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15929,7 +17293,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15946,7 +17312,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15962,7 +17330,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 
s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15977,7 +17347,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15993,7 +17365,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16009,7 +17383,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16024,7 +17400,9 @@ define 
amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16039,7 +17417,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16054,7 +17434,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16069,7 +17451,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16084,7 +17468,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16140,7 +17526,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16157,7 +17545,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16173,7 +17563,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 
v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16188,7 +17580,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16204,7 +17598,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16220,7 +17616,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16235,7 +17633,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16250,7 +17650,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16265,7 +17667,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16280,7 +17684,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16295,7 +17701,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 74a297241d851..2a21d2013ceca 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -444,6 +444,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -469,6 +470,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -628,6 +630,7 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -655,6 +658,7 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,6 +1545,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1563,6 +1568,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1863,6 +1869,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1887,6 +1894,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2037,6 +2045,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -2061,6 +2070,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2218,6 +2228,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2245,6 +2256,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2414,6 +2426,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,6 +2456,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,6 +2634,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,6 +2664,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,6 +3000,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3010,6 +3027,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) 
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3360,6 +3378,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3388,6 +3407,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3561,6 +3581,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3589,6 +3610,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3754,6 +3776,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3780,6 +3803,7 
@@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3937,6 +3961,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3963,6 +3988,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4128,6 +4154,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4156,6 +4183,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4329,6 +4357,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4357,6 +4386,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4530,6 +4560,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4558,6 +4589,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4731,6 +4763,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4759,6 +4792,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 
v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4932,6 +4966,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4960,6 +4995,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5133,6 +5169,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5161,6 +5198,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5334,6 +5372,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5362,6 +5401,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5535,6 +5575,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5563,6 +5604,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5954,6 +5996,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5985,6 +6028,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: 
buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6407,6 +6451,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6440,6 +6485,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6641,6 +6687,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6674,6 +6721,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6867,6 +6915,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6898,6 +6947,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7083,6 +7133,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7114,6 +7165,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7307,6 +7359,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,6 +7393,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,6 +7595,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7574,6 +7629,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7775,6 +7831,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7808,6 +7865,7 @@ define amdgpu_kernel void 
@local_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8009,6 +8067,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8042,6 +8101,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8243,6 +8303,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8276,6 +8337,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8477,6 +8539,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8510,6 +8573,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8711,6 +8775,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8744,6 +8809,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8945,6 +9011,7 @@ 
define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8978,6 +9045,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9415,6 +9483,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9498,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9442,6 +9512,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9454,6 +9525,7 @@ define amdgpu_kernel void 
@local_system_one_as_acquire_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9467,6 +9539,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9553,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9492,6 +9566,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9579,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 
@@ -9516,6 +9592,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9528,6 +9605,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9540,6 +9618,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -9585,7 +9664,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9680,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
@@ -9612,7 +9695,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9624,7 +9709,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9637,7 +9724,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9739,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9662,7 +9753,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9767,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9686,7 +9781,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9698,7 +9795,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9710,7 +9809,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10036,6 +10137,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10149,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10057,6 +10160,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10067,6 +10171,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10078,6 +10183,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10194,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10098,6 +10205,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10216,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10118,6 +10227,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10128,6 +10238,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10138,6 +10249,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10176,6 +10288,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10300,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10197,6 +10311,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10207,6 +10322,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10218,6 +10334,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ 
-10228,6 +10345,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10238,6 +10356,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10367,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10258,6 +10378,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10268,6 +10389,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10278,6 +10400,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10457,6 +10580,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10468,6 +10592,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10478,6 +10603,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10488,6 +10614,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10499,6 +10626,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10509,6 +10637,7 @@ define amdgpu_kernel void 
@local_system_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10519,6 +10648,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10529,6 +10659,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10539,6 +10670,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10549,6 +10681,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10559,6 +10692,7 @@ define amdgpu_kernel void 
@local_system_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_atomicrmw: @@ -10596,6 +10730,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10742,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10617,6 +10753,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10627,6 +10764,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10638,6 +10776,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ 
-10648,6 +10787,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10658,6 +10798,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10809,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10678,6 +10820,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10688,6 +10831,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10698,6 +10842,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10736,7 +10881,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10747,7 +10894,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10757,7 +10906,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10767,7 +10918,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_atomicrmw: 
@@ -10778,7 +10931,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10788,7 +10943,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10798,7 +10955,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10808,7 +10967,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10818,7 +10979,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10828,7 +10991,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10838,7 +11003,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10876,7 +11043,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10887,7 +11056,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10897,7 +11068,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10907,7 +11080,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10918,7 +11093,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10928,7 +11105,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10938,7 +11117,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10948,7 +11129,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10958,7 +11141,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10968,7 +11153,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10978,7 +11165,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -11017,6 +11206,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11222,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11046,6 +11237,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11059,6 +11251,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11073,6 +11266,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11281,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11100,6 +11295,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11309,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: 
ds_write_b32 v0, v1 @@ -11126,6 +11323,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11139,6 +11337,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11152,6 +11351,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11199,7 +11399,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11416,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: 
s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11228,7 +11432,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11241,7 +11447,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11255,7 +11463,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11479,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11282,7 +11494,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11509,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11308,7 +11524,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11321,7 +11539,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11334,7 +11554,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11382,7 +11604,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11621,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11411,7 +11637,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11424,7 +11652,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11438,7 +11668,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11684,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11465,7 +11699,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11714,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11491,7 +11729,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11504,7 +11744,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11517,7 +11759,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11735,6 +11979,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11993,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11760,6 +12006,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11772,6 +12019,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
local_system_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +12033,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +12046,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11809,6 +12059,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +12072,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11833,6 +12085,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11845,6 +12098,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11857,6 +12111,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +12156,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12170,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11926,6 +12183,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11938,6 +12196,7 @@ define amdgpu_kernel void 
@local_system_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -11951,6 +12210,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12223,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11975,6 +12236,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12249,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11999,6 +12262,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -12011,6 +12275,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -12023,6 +12288,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -12068,7 +12334,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12349,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12093,7 +12363,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12105,7 +12377,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12392,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12406,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12142,7 +12420,9 @@ define amdgpu_kernel void 
@local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12434,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12166,7 +12448,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12178,7 +12462,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12190,7 +12476,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12523,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12538,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12260,7 +12552,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12272,7 +12566,9 @@ define amdgpu_kernel void 
@local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12581,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12595,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12309,7 +12609,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12623,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12333,7 +12637,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12345,7 +12651,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12357,7 +12665,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12713,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12727,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12428,6 +12740,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12440,6 +12753,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12767,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12780,7 @@ 
define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12477,6 +12793,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12806,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12501,6 +12819,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12513,6 +12832,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
local_system_one_as_monotonic_acquire_cmpxchg: @@ -12525,6 +12845,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12891,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12905,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12595,6 +12918,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12607,6 +12931,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12945,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12958,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12644,6 +12971,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12984,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12668,6 +12997,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12680,6 
+13010,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12692,6 +13023,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +13068,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12749,7 +13083,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12761,7 +13097,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12773,7 +13111,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12786,7 +13126,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12798,7 +13140,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12810,7 +13154,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12822,7 +13168,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12834,7 +13182,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12846,7 +13196,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12858,7 +13210,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12903,7 +13257,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13272,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12928,7 +13286,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12940,7 +13300,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13315,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13329,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12977,7 +13343,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13357,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 
v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13001,7 +13371,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13013,7 +13385,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13025,7 +13399,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13446,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13461,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13095,7 +13475,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13107,7 +13489,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13504,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13518,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13144,7 +13532,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13546,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13168,7 +13560,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13180,7 +13574,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13192,7 +13588,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13635,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13650,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: 
local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13262,7 +13664,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13274,7 +13678,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13693,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13707,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13311,7 +13721,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13735,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13335,7 +13749,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13347,7 +13763,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13359,7 +13777,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13824,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13839,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13429,7 +13853,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13441,7 +13867,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13454,7 +13882,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13896,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13478,7 +13910,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13924,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13502,7 +13938,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13514,7 +13952,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13526,7 +13966,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +14013,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +14028,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13596,7 +14042,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13608,7 +14056,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +14071,9 @@ 
define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +14085,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13645,7 +14099,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +14113,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13669,7 +14127,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13681,7 +14141,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13693,7 +14155,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +14202,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +14217,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13763,7 +14231,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13775,7 +14245,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +14260,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +14274,9 @@ 
define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13812,7 +14288,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14302,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13836,7 +14316,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13848,7 +14330,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13860,7 +14344,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14391,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14406,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13930,7 +14420,9 @@ define amdgpu_kernel void 
@local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13942,7 +14434,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14449,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14463,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13979,7 +14477,9 @@ 
define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14491,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14003,7 +14505,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14015,7 +14519,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14027,7 +14533,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14792,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14810,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14317,6 +14827,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14332,6 +14843,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14348,6 +14860,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14877,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14379,6 +14893,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14909,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14409,6 +14925,7 @@ define 
amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14424,6 +14941,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14439,6 +14957,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14494,6 +15013,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +15031,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 @@ -14527,6 +15048,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14542,6 +15064,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14558,6 +15081,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +15098,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14589,6 +15114,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +15130,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14619,6 +15146,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14634,6 +15162,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14649,6 +15178,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 
+15235,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +15254,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14738,7 +15272,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14753,7 +15289,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14769,7 +15307,9 @@ define amdgpu_kernel 
void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15325,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14800,7 +15342,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15359,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14830,7 +15376,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14845,7 +15393,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14860,7 +15410,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14916,7 +15468,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 
v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15487,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14949,7 +15505,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14964,7 +15522,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14980,7 +15540,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15558,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15011,7 +15575,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15592,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15041,7 +15609,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15056,7 +15626,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15071,7 +15643,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15128,6 +15702,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15720,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15161,6 +15737,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15176,6 +15753,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15192,6 +15770,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15787,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; 
GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15223,6 +15803,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15819,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15253,6 +15835,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15268,6 +15851,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15283,6 +15867,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15339,6 +15924,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15942,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15372,6 +15959,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15387,6 +15975,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15403,6 +15992,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +16009,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15434,6 +16025,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +16041,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15464,6 +16057,7 @@ define amdgpu_kernel void 
@local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15479,6 +16073,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15494,6 +16089,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15549,7 +16145,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +16164,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15582,7 +16182,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15597,7 +16199,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15613,7 +16217,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +16235,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15644,7 +16252,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +16269,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15674,7 +16286,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15689,7 +16303,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15704,7 +16320,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15760,7 +16378,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16397,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) @@ -15793,7 +16415,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15808,7 +16432,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15824,7 +16450,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16468,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15855,7 +16485,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16502,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15885,7 +16519,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15900,7 +16536,9 @@ define amdgpu_kernel void 
@local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15915,7 +16553,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15971,7 +16611,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16630,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16004,7 +16648,9 @@ define amdgpu_kernel void 
@local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16019,7 +16665,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16035,7 +16683,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16701,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16066,7 +16718,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16735,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16096,7 +16752,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16111,7 +16769,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16126,7 +16786,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16182,7 +16844,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16863,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16215,7 +16881,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 
v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16230,7 +16898,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16246,7 +16916,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16934,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 
@@ -16277,7 +16951,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16968,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16307,7 +16985,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16322,7 +17002,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16337,7 +17019,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16393,7 +17077,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +17096,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16426,7 +17114,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, 
v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16441,7 +17131,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16457,7 +17149,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +17167,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16488,7 +17184,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +17201,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16518,7 +17218,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16533,7 +17235,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 
s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16548,7 +17252,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16604,7 +17310,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +17329,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16637,7 +17347,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16652,7 +17364,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16668,7 +17382,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +17400,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16699,7 +17417,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17434,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16729,7 +17451,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16744,7 +17468,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16759,7 +17485,9 @@ define amdgpu_kernel void 
@local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16815,7 +17543,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17562,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16848,7 +17580,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16863,7 +17597,9 @@ define amdgpu_kernel void 
@local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16879,7 +17615,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17633,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16910,7 +17650,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17667,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16940,7 +17684,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16955,7 +17701,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16970,7 +17718,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 
v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -17026,7 +17776,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17795,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17059,7 +17813,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -17074,7 +17830,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 
s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -17090,7 +17848,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17866,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17121,7 +17883,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 
@@ -17136,7 +17900,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17151,7 +17917,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17166,7 +17934,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -17181,7 +17951,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index bc2508411ed6b..274a2ab18aa10 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -43,6 +43,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -186,6 +187,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -330,6 +332,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_volatile_store_0: @@ -343,6 +346,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_volatile_store_0: @@ -355,6 +359,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_volatile_store_0: @@ -367,6 +372,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_volatile_store_0: @@ -380,6 +386,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_volatile_store_0: @@ -392,6 +399,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_volatile_store_0: @@ -404,6 +412,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_volatile_store_0: @@ -461,6 +470,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_volatile_store_1: @@ -476,6 +486,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_volatile_store_1: @@ -489,6 +500,7 @@ define amdgpu_kernel void 
@local_volatile_store_1( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_volatile_store_1: @@ -502,6 +514,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_volatile_store_1: @@ -517,6 +530,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_volatile_store_1: @@ -532,6 +546,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_volatile_store_1: @@ -547,6 +562,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_volatile_store_1: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index b24622a48a16b..3a739284d55ba 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -366,6 +366,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +381,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -393,6 +395,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -405,6 +408,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -418,6 +422,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -431,6 +436,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -443,6 +449,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -455,6 +462,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -467,6 +475,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -479,6 +488,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -491,6 +501,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 
s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -536,7 +547,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -550,7 +563,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -563,7 +578,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -575,7 +592,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -588,7 +607,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -601,7 +622,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -613,7 +636,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -625,7 +650,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -637,7 
+664,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -649,7 +678,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -661,7 +692,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -987,6 +1020,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -998,6 +1032,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: 
ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1008,6 +1043,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1018,6 +1054,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1029,6 +1066,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1039,6 +1077,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1049,6 +1088,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1059,6 +1099,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1069,6 +1110,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1079,6 +1121,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1089,6 +1132,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1127,6 +1171,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1138,6 +1183,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1148,6 +1194,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1158,6 +1205,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1169,6 +1217,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1179,6 +1228,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1189,6 +1239,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1199,6 +1250,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1209,6 +1261,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1219,6 +1272,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1229,6 +1283,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1408,6 +1463,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_atomicrmw: @@ -1419,6 +1475,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_atomicrmw: @@ -1429,6 +1486,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acquire_atomicrmw: @@ -1439,6 +1497,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_atomicrmw: @@ -1450,6 +1509,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1460,6 +1520,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1470,6 +1531,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1480,6 +1542,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: @@ -1490,6 +1553,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, 
v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acquire_atomicrmw: @@ -1500,6 +1564,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_atomicrmw: @@ -1510,6 +1575,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_atomicrmw: @@ -1547,6 +1613,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1558,6 +1625,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1568,6 +1636,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1578,6 +1647,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1589,6 +1659,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -1599,6 +1670,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1609,6 +1681,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1619,6 +1692,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1629,6 +1703,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1639,6 +1714,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1649,6 +1725,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1687,7 +1764,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1698,7 +1777,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1708,7 +1789,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_atomicrmw: @@ 
-1718,7 +1801,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1729,7 +1814,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1739,7 +1826,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1749,7 +1838,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1759,7 +1850,9 @@ define amdgpu_kernel 
void @local_wavefront_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1769,7 +1862,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1779,7 +1874,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1789,7 +1886,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: @@ -1827,7 +1926,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: 
v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1838,7 +1939,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1848,7 +1951,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1858,7 +1963,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1869,7 +1976,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1879,7 +1988,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1889,7 +2000,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1899,7 +2012,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1909,7 +2024,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1919,7 +2036,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1929,7 +2048,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: @@ -1968,6 +2089,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1983,6 +2105,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1997,6 +2120,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -2010,6 +2134,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -2024,6 +2149,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2038,6 +2164,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2051,6 +2178,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2064,6 +2192,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2077,6 +2206,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2090,6 +2220,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -2103,6 +2234,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2150,7 +2282,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2165,7 +2299,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2179,7 +2315,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -2192,7 +2330,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -2206,7 +2346,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2220,7 +2362,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2233,7 +2377,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2246,7 +2392,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2259,7 +2407,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2272,7 +2422,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -2285,7 +2437,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2333,7 +2487,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2348,7 +2504,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2362,7 +2520,9 @@ define amdgpu_kernel void 
@local_wavefront_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -2375,7 +2535,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -2389,7 +2551,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2403,7 +2567,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
ds_write_b32 v0, v1 @@ -2416,7 +2582,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2429,7 +2597,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2442,7 +2612,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -2455,7 +2627,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -2468,7 +2642,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2686,6 +2862,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2699,6 +2876,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2711,6 +2889,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2723,6 +2902,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2736,6 +2916,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2748,6 +2929,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2760,6 +2942,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2772,6 +2955,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2784,6 +2968,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2796,6 +2981,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2808,6 +2994,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: @@ -2852,6 +3039,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -2865,6 +3053,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -2877,6 +3066,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2889,6 +3079,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -2902,6 +3093,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -2914,6 +3106,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2926,6 +3119,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -2938,6 +3132,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2950,6 +3145,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2962,6 +3158,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -2974,6 +3171,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3019,7 +3217,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3032,7 +3232,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3044,7 +3246,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3056,7 +3260,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3069,7 +3275,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3081,7 +3289,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3093,7 +3303,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3105,7 +3317,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3117,7 +3331,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3129,7 +3345,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3141,7 +3359,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: @@ -3186,7 +3406,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3199,7 +3421,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3211,7 +3435,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3223,7 +3449,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3236,7 +3464,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3248,7 +3478,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3260,7 +3492,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3272,7 +3506,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3284,7 +3520,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3296,7 +3534,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3308,7 +3548,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: @@ -3354,6 +3596,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
local_wavefront_monotonic_acquire_cmpxchg: @@ -3367,6 +3610,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3379,6 +3623,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3391,6 +3636,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3404,6 +3650,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3416,6 +3663,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3428,6 +3676,7 @@ define 
amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3440,6 +3689,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3452,6 +3702,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3464,6 +3715,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3476,6 +3728,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: @@ -3521,6 +3774,7 @@ define amdgpu_kernel void 
@local_wavefront_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3534,6 +3788,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3546,6 +3801,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3558,6 +3814,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3571,6 +3828,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3583,6 +3841,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3595,6 +3854,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3607,6 +3867,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3619,6 +3880,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3631,6 +3893,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3643,6 +3906,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 
offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: @@ -3687,7 +3951,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3700,7 +3966,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3712,7 +3980,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3724,7 +3994,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3737,7 +4009,9 @@ define 
amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3749,7 +4023,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3761,7 +4037,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3773,7 +4051,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX942-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3785,7 +4065,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3797,7 +4079,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3809,7 +4093,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: @@ -3854,7 +4140,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3867,7 +4155,9 @@ define amdgpu_kernel void 
@local_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3879,7 +4169,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3891,7 +4183,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3904,7 +4198,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3916,7 +4212,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3928,7 +4226,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3940,7 +4240,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3952,7 +4254,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3964,7 +4268,9 @@ define amdgpu_kernel void 
@local_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -3976,7 +4282,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: @@ -4021,7 +4329,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4034,7 +4344,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4046,7 +4358,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4058,7 +4372,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4071,7 +4387,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4083,7 +4401,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4095,7 +4415,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4107,7 +4429,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4119,7 +4443,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4131,7 +4457,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4143,7 +4471,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: @@ -4188,7 +4518,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4201,7 +4533,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4213,7 +4547,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4225,7 +4561,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4238,7 +4576,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4250,7 +4590,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4262,7 +4604,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4274,7 +4618,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4286,7 +4632,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4298,7 +4646,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4310,7 +4660,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: @@ -4355,7 +4707,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; 
; GFX7-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4368,7 +4722,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4380,7 +4736,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4392,7 +4750,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4405,7 +4765,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4417,7 +4779,9 @@ define amdgpu_kernel void 
@local_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4429,7 +4793,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4441,7 +4807,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4453,7 +4821,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
local_wavefront_acquire_seq_cst_cmpxchg: @@ -4465,7 +4835,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4477,7 +4849,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: @@ -4522,7 +4896,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4535,7 +4911,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4547,7 +4925,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4559,7 +4939,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4572,7 +4954,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4584,7 +4968,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4596,7 +4982,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4608,7 +4996,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4620,7 +5010,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4632,7 +5024,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4644,7 +5038,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: @@ -4689,7 +5085,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4702,7 +5100,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4714,7 +5114,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4726,7 +5128,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4739,7 +5143,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4751,7 +5157,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4763,7 +5171,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4775,7 +5185,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4787,7 +5199,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4799,7 +5213,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4811,7 +5227,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: @@ -4856,7 +5274,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: 
s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4869,7 +5289,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4881,7 +5303,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4893,7 +5317,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4906,7 +5332,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4918,7 +5346,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4930,7 +5360,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4942,7 +5374,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4954,7 +5388,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4966,7 +5402,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4978,7 +5416,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: @@ -5235,6 +5675,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5252,6 +5693,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5268,6 +5710,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; 
GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -5283,6 +5726,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -5299,6 +5743,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5315,6 +5760,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5330,6 +5776,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ 
-5345,6 +5792,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5360,6 +5808,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5375,6 +5824,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -5390,6 +5840,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -5445,6 +5896,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5462,6 +5914,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5478,6 +5931,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5493,6 +5947,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5509,6 +5964,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5525,6 +5981,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5540,6 +5997,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5555,6 +6013,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5570,6 +6029,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5585,6 +6045,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) @@ -5600,6 +6061,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5656,7 +6118,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5673,7 +6137,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5689,7 +6155,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -5704,7 +6172,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -5720,7 +6190,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5736,7 +6208,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5751,7 +6225,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5766,7 +6242,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5781,7 +6259,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5796,7 +6276,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -5811,7 +6293,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -5867,7 +6351,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5884,7 +6370,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -5900,7 +6388,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -5915,7 +6405,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 
+; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -5931,7 +6423,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -5947,7 +6441,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5962,7 +6458,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5977,7 +6475,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -5992,7 +6492,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6007,7 +6509,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6022,7 +6526,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6079,6 +6585,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6096,6 +6603,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6112,6 +6620,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6127,6 +6636,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6143,6 +6653,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6159,6 +6670,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6174,6 +6686,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6189,6 +6702,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6204,6 +6718,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6219,6 +6734,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 
s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6234,6 +6750,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6290,6 +6807,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,6 +6825,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6323,6 +6842,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6338,6 +6858,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6354,6 +6875,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6370,6 +6892,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6385,6 +6908,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6400,6 +6924,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6415,6 +6940,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6430,6 +6956,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6445,6 +6972,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6500,7 +7028,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6517,7 +7047,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: 
v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6533,7 +7065,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6548,7 +7082,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6564,7 +7100,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6580,7 +7118,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6595,7 +7135,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6610,7 +7152,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6625,7 +7169,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6640,7 +7186,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6655,7 +7203,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6711,7 +7261,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6728,7 +7280,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6744,7 +7298,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6759,7 +7315,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6775,7 +7333,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -6791,7 +7351,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6806,7 +7368,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6821,7 +7385,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6836,7 +7402,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -6851,7 +7419,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -6866,7 +7436,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -6922,7 +7494,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6939,7 +7513,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -6955,7 +7531,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -6970,7 +7548,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -6986,7 +7566,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7002,7 +7584,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ 
-7017,7 +7601,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7032,7 +7618,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7047,7 +7635,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7062,7 +7652,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, 
v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7077,7 +7669,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7133,7 +7727,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7150,7 +7746,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7166,7 +7764,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -7181,7 +7781,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -7197,7 +7799,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7213,7 +7817,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7228,7 +7834,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7243,7 +7851,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7258,7 +7868,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7273,7 +7885,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7288,7 
+7902,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7344,7 +7960,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7361,7 +7979,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7377,7 +7997,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -7392,7 +8014,9 @@ define amdgpu_kernel void 
@local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -7408,7 +8032,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7424,7 +8050,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7439,7 +8067,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7454,7 +8084,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7469,7 +8101,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7484,7 +8118,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7499,7 +8135,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 
+; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7555,7 +8193,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7572,7 +8212,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7588,7 +8230,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -7603,7 +8247,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -7619,7 +8265,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7635,7 +8283,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7650,7 +8300,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7665,7 +8317,9 @@ define amdgpu_kernel void 
@local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7680,7 +8334,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7695,7 +8351,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7710,7 +8368,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7766,7 +8426,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7783,7 +8445,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7799,7 +8463,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -7814,7 +8480,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -7830,7 +8498,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -7846,7 +8516,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7861,7 +8533,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7876,7 +8550,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7891,7 +8567,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -7906,7 +8584,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7921,7 +8601,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7977,7 +8659,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7994,7 +8678,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8010,7 +8696,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -8025,7 +8713,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -8041,7 +8731,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8057,7 +8749,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8072,7 +8766,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8087,7 +8783,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8102,7 +8800,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8117,7 +8817,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8132,7 +8834,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8529,6 +9233,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) @@ -8543,6 +9248,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8556,6 +9262,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -8568,6 +9275,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -8581,6 +9289,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8594,6 +9303,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8606,6 +9316,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8618,6 +9329,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8630,6 +9342,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8642,6 +9355,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8654,6 +9368,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8699,7 +9414,9 @@ define 
amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8713,7 +9430,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8726,7 +9445,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -8738,7 +9459,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -8751,7 +9474,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -8764,7 +9489,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8776,7 +9503,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8788,7 +9517,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8800,7 +9531,9 @@ define amdgpu_kernel void 
@local_wavefront_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -8812,7 +9545,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8824,7 +9559,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -9150,6 +9887,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9161,6 +9899,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9171,6 +9910,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -9181,6 +9921,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -9192,6 +9933,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9202,6 +9944,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9212,6 +9955,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -9222,6 +9966,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9232,6 +9977,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -9242,6 +9988,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -9252,6 +9999,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -9290,6 +10038,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9301,6 +10050,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9311,6 +10061,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -9321,6 +10072,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -9332,6 +10084,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9342,6 +10095,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9352,6 +10106,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -9362,6 +10117,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9372,6 +10128,7 
@@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -9382,6 +10139,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -9392,6 +10150,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -9571,6 +10330,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9582,6 +10342,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9592,6 +10353,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; 
GFX10-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9602,6 +10364,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9613,6 +10376,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9623,6 +10387,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9633,6 +10398,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9643,6 +10409,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9653,6 +10420,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9663,6 +10431,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9673,6 +10442,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: @@ -9710,6 +10480,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -9721,6 +10492,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -9731,6 +10503,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -9741,6 +10514,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -9752,6 +10526,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9762,6 +10537,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9772,6 +10548,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -9782,6 +10559,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -9792,6 +10570,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -9802,6 +10581,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -9812,6 +10592,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -9850,7 +10631,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9861,7 +10644,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ 
-9871,7 +10656,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9881,7 +10668,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9892,7 +10681,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9902,7 +10693,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9912,7 +10705,9 @@ define 
amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9922,7 +10717,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9932,7 +10729,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9942,7 +10741,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9952,7 
+10753,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: @@ -9990,7 +10793,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10001,7 +10806,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10011,7 +10818,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10021,7 +10830,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10032,7 +10843,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10042,7 +10855,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10052,7 +10867,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10062,7 +10879,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10072,7 +10891,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10082,7 +10903,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10092,7 +10915,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: @@ -10131,6 +10956,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; 
GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10146,6 +10972,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10160,6 +10987,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -10173,6 +11001,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -10187,6 +11016,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10201,6 +11031,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10214,6 +11045,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10227,6 +11059,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10240,6 +11073,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10253,6 +11087,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -10266,6 +11101,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10313,7 +11149,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10328,7 +11166,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10342,7 +11182,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -10355,7 +11197,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -10369,7 +11213,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10383,7 +11229,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10396,7 +11244,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10409,7 +11259,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10422,7 +11274,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10435,7 +11289,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -10448,7 +11304,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
ds_store_b32 v0, v1 @@ -10496,7 +11354,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10511,7 +11371,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10525,7 +11387,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -10538,7 +11402,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -10552,7 +11418,9 @@ define amdgpu_kernel void 
@local_wavefront_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -10566,7 +11434,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10579,7 +11449,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10592,7 +11464,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10605,7 +11479,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -10618,7 +11494,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -10631,7 +11509,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10849,6 +11729,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; 
GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10862,6 +11743,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10874,6 +11756,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10886,6 +11769,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10899,6 +11783,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10911,6 +11796,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10923,6 +11809,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10935,6 +11822,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10947,6 +11835,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10959,6 +11848,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -10971,6 +11861,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 
offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -11015,6 +11906,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11028,6 +11920,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11040,6 +11933,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11052,6 +11946,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -11065,6 +11960,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11077,6 +11973,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11089,6 +11986,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11101,6 +11999,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11113,6 +12012,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -11125,6 +12025,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -11137,6 +12038,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -11182,7 +12084,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11195,7 +12099,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11207,7 +12113,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11219,7 +12127,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11232,7 +12142,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11244,7 +12156,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11256,7 +12170,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11268,7 +12184,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11280,7 +12198,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11292,7 +12212,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11304,7 +12226,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -11349,7 +12273,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: 
v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11362,7 +12288,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11374,7 +12302,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11386,7 +12316,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11399,7 +12331,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 
s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11411,7 +12345,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11423,7 +12359,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11435,7 +12373,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11447,7 +12387,9 @@ define amdgpu_kernel void 
@local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11459,7 +12401,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11471,7 +12415,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -11517,6 +12463,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11530,6 +12477,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11542,6 +12490,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11554,6 +12503,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11567,6 +12517,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11579,6 +12530,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11591,6 +12543,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11603,6 +12556,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11615,6 +12569,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11627,6 +12582,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11639,6 +12595,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -11684,6 +12641,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11697,6 +12655,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11709,6 +12668,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11721,6 +12681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11734,6 +12695,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11746,6 +12708,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11758,6 +12721,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11770,6 +12734,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11782,6 +12747,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11794,6 +12760,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11806,6 +12773,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: @@ -11850,7 +12818,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11863,7 +12833,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11875,7 +12847,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11887,7 +12861,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11900,7 +12876,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11912,7 +12890,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11924,7 +12904,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11936,7 +12918,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 
s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11948,7 +12932,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11960,7 +12946,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -11972,7 +12960,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: @@ -12017,7 +13007,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, 
s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12030,7 +13022,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12042,7 +13036,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12054,7 +13050,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12067,7 +13065,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12079,7 +13079,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12091,7 +13093,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12103,7 +13107,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12115,7 +13121,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12127,7 +13135,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12139,7 +13149,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -12184,7 +13196,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12197,7 +13211,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12209,7 +13225,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12221,7 +13239,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12234,7 +13254,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12246,7 +13268,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12258,7 +13282,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12270,7 +13296,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12282,7 +13310,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12294,7 +13324,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12306,7 +13338,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -12351,7 +13385,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12364,7 +13400,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12376,7 +13414,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12388,7 +13428,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12401,7 +13443,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12413,7 +13457,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12425,7 +13471,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12437,7 +13485,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12449,7 +13499,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12461,7 +13513,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12473,7 +13527,9 @@ define amdgpu_kernel void 
@local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -12518,7 +13574,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12531,7 +13589,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12543,7 +13603,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12555,7 +13617,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12568,7 +13632,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12580,7 +13646,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12592,7 +13660,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12604,7 +13674,9 @@ define amdgpu_kernel void 
@local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12616,7 +13688,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12628,7 +13702,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12640,7 +13716,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -12685,7 
+13763,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12698,7 +13778,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12710,7 +13792,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12722,7 +13806,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12735,7 +13821,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12747,7 +13835,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12759,7 +13849,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12771,7 +13863,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: 
local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12783,7 +13877,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12795,7 +13891,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12807,7 +13905,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: @@ -12852,7 +13952,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12865,7 
+13967,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12877,7 +13981,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12889,7 +13995,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12902,7 +14010,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12914,7 +14024,9 @@ define amdgpu_kernel void 
@local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12926,7 +14038,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12938,7 +14052,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12950,7 +14066,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12962,7 +14080,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -12974,7 +14094,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -13019,7 +14141,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13032,7 +14156,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13044,7 +14170,9 @@ define 
amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13056,7 +14184,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13069,7 +14199,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13081,7 +14213,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13093,7 +14227,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13105,7 +14241,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13117,7 +14255,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13129,7 +14269,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13141,7 +14283,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -13398,6 +14542,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13415,6 +14560,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13431,6 +14577,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -13446,6 +14593,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; 
GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -13462,6 +14610,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13478,6 +14627,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13493,6 +14643,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13508,6 +14659,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13523,6 +14675,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13538,6 +14691,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -13553,6 +14707,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -13608,6 +14763,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13625,6 +14781,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13641,6 +14798,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13656,6 +14814,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13672,6 +14831,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13688,6 +14848,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13703,6 +14864,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13718,6 +14880,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13733,6 +14896,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13748,6 +14912,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13763,6 +14928,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13819,7 +14985,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -13836,7 +15004,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13852,7 +15022,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -13867,7 +15039,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -13883,7 +15057,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -13899,7 +15075,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13914,7 +15092,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13929,7 +15109,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13944,7 +15126,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -13959,7 +15143,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -13974,7 +15160,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14030,7 +15218,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14047,7 +15237,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14063,7 +15255,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14078,7 +15272,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14094,7 +15290,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14110,7 +15308,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14125,7 +15325,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14140,7 +15342,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14155,7 +15359,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14170,7 +15376,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14185,7 +15393,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ 
-14242,6 +15452,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14259,6 +15470,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14275,6 +15487,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14290,6 +15503,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14306,6 +15520,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 
s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14322,6 +15537,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14337,6 +15553,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14352,6 +15569,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14367,6 +15585,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14382,6 +15601,7 @@ define amdgpu_kernel void 
@local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14397,6 +15617,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14453,6 +15674,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14470,6 +15692,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14486,6 +15709,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14501,6 +15725,7 @@ define 
amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14517,6 +15742,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14533,6 +15759,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14548,6 +15775,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14563,6 +15791,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 
v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14578,6 +15807,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14593,6 +15823,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14608,6 +15839,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14663,7 +15895,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ 
-14680,7 +15914,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14696,7 +15932,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14711,7 +15949,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14727,7 +15967,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14743,7 +15985,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14758,7 +16002,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14773,7 +16019,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14788,7 +16036,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14803,7 +16053,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14818,7 +16070,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14874,7 +16128,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14891,7 +16147,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14907,7 +16165,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14922,7 +16182,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14938,7 +16200,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14954,7 +16218,9 @@ define amdgpu_kernel void 
@local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14969,7 +16235,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14984,7 +16252,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14999,7 +16269,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15014,7 +16286,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15029,7 +16303,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15085,7 +16361,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15102,7 +16380,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15118,7 +16398,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15133,7 +16415,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15149,7 +16433,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15165,7 +16451,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15180,7 +16468,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15195,7 +16485,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15210,7 +16502,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15225,7 +16519,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15240,7 +16536,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15296,7 +16594,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15313,7 +16613,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: 
s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15329,7 +16631,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15344,7 +16648,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15360,7 +16666,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15376,7 +16684,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15391,7 +16701,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15406,7 +16718,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15421,7 +16735,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15436,7 +16752,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15451,7 +16769,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15507,7 +16827,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15524,7 +16846,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15540,7 +16864,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15555,7 +16881,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15571,7 +16899,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15587,7 +16917,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15602,7 +16934,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15617,7 +16951,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15632,7 +16968,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15647,7 +16985,9 @@ define 
amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15662,7 +17002,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15718,7 +17060,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15735,7 +17079,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15751,7 +17097,9 @@ define amdgpu_kernel void 
@local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15766,7 +17114,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15782,7 +17132,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15798,7 +17150,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15813,7 +17167,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15828,7 +17184,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15843,7 +17201,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15858,7 +17218,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15873,7 +17235,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15929,7 +17293,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15946,7 +17312,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15962,7 +17330,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15977,7 +17347,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15993,7 +17365,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16009,7 +17383,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
ds_write_b32 v0, v1 @@ -16024,7 +17400,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16039,7 +17417,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16054,7 +17434,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16069,7 +17451,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16084,7 +17468,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16140,7 +17526,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16157,7 +17545,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16173,7 +17563,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16188,7 +17580,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16204,7 +17598,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16220,7 +17616,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16235,7 +17633,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16250,7 +17650,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16265,7 +17667,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16280,7 +17684,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16295,7 +17701,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 62d7f4801baf8..4a8ad597ac0f3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -444,6 +444,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -469,6 +470,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -628,6 +630,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -655,6 +658,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,6 +1545,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1563,6 +1568,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -1863,6 +1869,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1887,6 +1894,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2037,6 +2045,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -2061,6 +2070,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2218,6 +2228,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2245,6 +2256,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2414,6 +2426,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,6 +2456,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,6 +2634,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,6 +2664,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,6 +3000,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3010,6 +3027,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, 
v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3360,6 +3378,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3388,6 +3407,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3561,6 +3581,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3589,6 +3610,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3754,6 +3776,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3780,6 +3803,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3937,6 +3961,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3963,6 +3988,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4128,6 +4154,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4156,6 +4183,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4329,6 +4357,7 
@@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4357,6 +4386,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4530,6 +4560,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4558,6 +4589,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4731,6 +4763,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4759,6 +4792,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4932,6 +4966,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4960,6 +4995,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5133,6 +5169,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5161,6 +5198,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5334,6 +5372,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5362,6 +5401,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5535,6 +5575,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5563,6 +5604,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5954,6 +5996,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5985,6 +6028,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6407,6 +6451,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6440,6 +6485,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6641,6 +6687,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6674,6 +6721,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6867,6 +6915,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6898,6 +6947,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7083,6 +7133,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7114,6 +7165,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7307,6 +7359,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,6 +7393,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,6 +7595,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7574,6 +7629,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7775,6 +7831,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7808,6 +7865,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8009,6 +8067,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8042,6 +8101,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8243,6 +8303,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8276,6 +8337,7 @@ define amdgpu_kernel void 
@local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8477,6 +8539,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8510,6 +8573,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8711,6 +8775,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8744,6 +8809,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8945,6 +9011,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8978,6 +9045,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9415,6 +9483,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9429,6 +9498,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9442,6 +9512,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9454,6 +9525,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9467,6 +9539,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9480,6 +9553,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9492,6 +9566,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9504,6 +9579,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; 
GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9516,6 +9592,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9528,6 +9605,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9540,6 +9618,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -9585,7 +9664,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9680,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9612,7 +9695,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9624,7 +9709,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9637,7 +9724,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9739,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9662,7 +9753,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9767,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9686,7 +9781,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9698,7 +9795,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9710,7 +9809,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10036,6 +10137,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10149,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10057,6 +10160,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10067,6 +10171,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10078,6 +10183,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10194,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10098,6 +10205,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10216,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10118,6 +10227,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 
v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10128,6 +10238,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10138,6 +10249,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10176,6 +10288,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10300,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10197,6 +10311,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10207,6 +10322,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10218,6 +10334,7 @@ 
define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10345,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10238,6 +10356,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10367,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10258,6 +10378,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10268,6 +10389,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10278,6 +10400,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10457,6 +10580,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10468,6 +10592,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10478,6 +10603,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10488,6 +10614,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10499,6 +10626,7 @@ define amdgpu_kernel void 
@local_workgroup_one_as_acquire_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10509,6 +10637,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10519,6 +10648,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10529,6 +10659,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10539,6 +10670,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10549,6 +10681,7 @@ 
define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10559,6 +10692,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: @@ -10596,6 +10730,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10742,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10617,6 +10753,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10627,6 +10764,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 
v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10638,6 +10776,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10787,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10658,6 +10798,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10809,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10678,6 +10820,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10688,6 +10831,7 @@ define 
amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10698,6 +10842,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10736,7 +10881,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10747,7 +10894,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10757,7 +10906,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ 
-10767,7 +10918,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10778,7 +10931,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10788,7 +10943,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10798,7 +10955,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: 
local_workgroup_one_as_acq_rel_atomicrmw: @@ -10808,7 +10967,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10818,7 +10979,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10828,7 +10991,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10838,7 +11003,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: 
local_workgroup_one_as_acq_rel_atomicrmw: @@ -10876,7 +11043,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10887,7 +11056,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10897,7 +11068,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10907,7 +11080,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10918,7 +11093,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10928,7 +11105,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10938,7 +11117,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10948,7 +11129,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10958,7 +11141,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10968,7 +11153,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10978,7 +11165,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -11017,6 +11206,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11222,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11046,6 +11237,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11059,6 +11251,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11073,6 +11266,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11087,6 +11281,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11100,6 +11295,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11113,6 +11309,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11126,6 +11323,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11139,6 +11337,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11152,6 +11351,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11199,7 +11399,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11416,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11228,7 +11432,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11241,7 +11447,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11255,7 +11463,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11479,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11282,7 +11494,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11509,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11308,7 +11524,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11321,7 +11539,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11334,7 +11554,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11382,7 +11604,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: 
s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11621,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11411,7 +11637,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11424,7 +11652,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11438,7 +11668,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11684,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11465,7 +11699,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11714,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11491,7 +11729,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11504,7 +11744,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11517,7 +11759,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11735,6 +11979,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11748,6 +11993,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: 
local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11760,6 +12006,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11772,6 +12019,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11785,6 +12033,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11797,6 +12046,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11809,6 +12059,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) 
; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11821,6 +12072,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11833,6 +12085,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11845,6 +12098,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11857,6 +12111,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -11901,6 +12156,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) 
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12170,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11926,6 +12183,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11938,6 +12196,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -11951,6 +12210,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12223,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11975,6 +12236,7 @@ define amdgpu_kernel void 
@local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12249,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11999,6 +12262,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -12011,6 +12275,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -12023,6 +12288,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -12068,7 +12334,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; 
GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12349,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12093,7 +12363,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12105,7 +12377,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12392,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12406,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12142,7 +12420,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12434,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12166,7 +12448,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12178,7 +12462,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12190,7 +12476,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12523,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12538,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12260,7 +12552,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12272,7 +12566,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12581,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12595,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12309,7 +12609,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12623,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12333,7 +12637,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12345,7 +12651,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12357,7 +12665,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12403,6 +12713,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12416,6 +12727,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12428,6 +12740,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12440,6 +12753,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12453,6 +12767,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12465,6 +12780,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12477,6 +12793,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12489,6 +12806,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12501,6 +12819,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12513,6 +12832,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12525,6 +12845,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: @@ -12570,6 +12891,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12583,6 +12905,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12595,6 +12918,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12607,6 +12931,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12620,6 +12945,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12632,6 +12958,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12644,6 +12971,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12656,6 +12984,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12668,6 +12997,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12680,6 +13010,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12692,6 +13023,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: @@ -12736,7 +13068,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12749,7 +13083,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12761,7 +13097,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12773,7 +13111,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12786,7 +13126,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 
s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12798,7 +13140,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12810,7 +13154,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12822,7 +13168,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12834,7 +13182,9 @@ define 
amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12846,7 +13196,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12858,7 +13210,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12903,7 +13257,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13272,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12928,7 +13286,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12940,7 +13300,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13315,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13329,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12977,7 +13343,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13357,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13001,7 +13371,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13013,7 +13385,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13025,7 +13399,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13446,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13461,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13095,7 +13475,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13107,7 +13489,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13504,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13518,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ 
-13144,7 +13532,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13546,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13168,7 +13560,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13180,7 +13574,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13192,7 +13588,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13237,7 +13635,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13250,7 +13650,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13262,7 +13664,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13274,7 +13678,9 @@ define 
amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13287,7 +13693,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13299,7 +13707,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13311,7 +13721,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; 
GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13323,7 +13735,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13335,7 +13749,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13347,7 +13763,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13359,7 +13777,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: @@ -13404,7 +13824,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13417,7 +13839,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13429,7 +13853,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13441,7 +13867,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ 
-13454,7 +13882,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13466,7 +13896,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13478,7 +13910,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13490,7 +13924,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13502,7 +13938,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13514,7 +13952,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13526,7 +13966,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: @@ -13571,7 +14013,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +14028,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13596,7 +14042,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13608,7 +14056,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +14071,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +14085,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13645,7 +14099,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +14113,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13669,7 +14127,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: 
ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13681,7 +14141,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13693,7 +14155,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +14202,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +14217,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm 
; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13763,7 +14231,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13775,7 +14245,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +14260,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +14274,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13812,7 +14288,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14302,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13836,7 +14316,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13848,7 +14330,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13860,7 +14344,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14391,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14406,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13930,7 +14420,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13942,7 +14434,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14449,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14463,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13979,7 +14477,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14491,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14003,7 +14505,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14015,7 +14519,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14027,7 +14533,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14284,6 +14792,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14301,6 +14810,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14317,6 +14827,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14332,6 +14843,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14348,6 +14860,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14364,6 +14877,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14379,6 +14893,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14394,6 +14909,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14409,6 +14925,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14424,6 +14941,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14439,6 +14957,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14494,6 +15013,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +15031,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14527,6 +15048,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14542,6 +15064,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14558,6 +15081,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +15098,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14589,6 +15114,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +15130,7 @@ define 
amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14619,6 +15146,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14634,6 +15162,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14649,6 +15178,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +15235,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) 
lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +15254,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14738,7 +15272,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14753,7 +15289,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14769,7 +15307,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15325,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14800,7 +15342,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15359,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14830,7 +15376,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14845,7 +15393,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14860,7 +15410,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14916,7 +15468,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) 
expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15487,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14949,7 +15505,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14964,7 +15522,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14980,7 +15540,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15558,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15011,7 +15575,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15592,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15041,7 +15609,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15056,7 +15626,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15071,7 +15643,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15128,6 +15702,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15145,6 +15720,7 @@ define amdgpu_kernel void 
@local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15161,6 +15737,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15176,6 +15753,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15192,6 +15770,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15208,6 +15787,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15223,6 +15803,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15238,6 +15819,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15253,6 +15835,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15268,6 +15851,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15283,6 +15867,7 @@ define amdgpu_kernel void 
@local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15339,6 +15924,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15356,6 +15942,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15372,6 +15959,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15387,6 +15975,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15403,6 +15992,7 @@ define amdgpu_kernel 
void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15419,6 +16009,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15434,6 +16025,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15449,6 +16041,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15464,6 +16057,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15479,6 +16073,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15494,6 +16089,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15549,7 +16145,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +16164,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, 
s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15582,7 +16182,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15597,7 +16199,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15613,7 +16217,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +16235,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15644,7 +16252,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +16269,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15674,7 +16286,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15689,7 +16303,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15704,7 +16320,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15760,7 +16378,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16397,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15793,7 +16415,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15808,7 +16432,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15824,7 +16450,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16468,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15855,7 +16485,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16502,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15885,7 +16519,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15900,7 +16536,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15915,7 +16553,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15971,7 +16611,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16630,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16004,7 +16648,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16019,7 +16665,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16035,7 +16683,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16701,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
ds_write_b32 v0, v1 @@ -16066,7 +16718,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16735,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16096,7 +16752,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16111,7 +16769,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) 
expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16126,7 +16786,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16182,7 +16844,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16199,7 +16863,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16215,7 +16881,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16230,7 +16898,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16246,7 +16916,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16262,7 +16934,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16277,7 +16951,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16292,7 +16968,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16307,7 +16985,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16322,7 +17002,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, 
v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16337,7 +17019,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16393,7 +17077,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16410,7 +17096,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16426,7 +17114,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16441,7 +17131,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16457,7 +17149,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16473,7 +17167,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16488,7 +17184,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16503,7 +17201,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16518,7 +17218,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16533,7 +17235,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16548,7 +17252,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16604,7 +17310,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +17329,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16637,7 +17347,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16652,7 +17364,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16668,7 +17382,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +17400,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16699,7 +17417,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17434,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16729,7 +17451,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16744,7 +17468,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16759,7 +17485,9 
@@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16815,7 +17543,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17562,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16848,7 +17580,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16863,7 +17597,9 @@ define amdgpu_kernel void 
@local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16879,7 +17615,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17633,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16910,7 +17650,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17667,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16940,7 +17684,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16955,7 +17701,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16970,7 +17718,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -17026,7 +17776,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17795,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17059,7 +17813,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -17074,7 +17830,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 
v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -17090,7 +17848,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17866,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17121,7 +17883,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17900,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17151,7 +17917,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17166,7 +17934,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -17181,7 +17951,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) 
lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir index 56dd95e373dc6..0101ef8f8455a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir @@ -70,6 +70,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -97,7 +98,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = 
V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -182,6 +185,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -209,7 +213,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -294,6 +300,7 @@ body: | ; GCN-NEXT: 
$m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -321,7 +328,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -406,6 +415,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire (s32) from `ptr addrspace(3) 
poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -433,7 +443,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -518,6 +530,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD 
killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -545,7 +558,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -622,6 +637,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -646,6 +662,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed 
renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -718,6 +735,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -742,6 +760,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -814,6 +833,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store 
syncscope("workgroup-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -838,6 +858,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -910,6 +931,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -934,6 +956,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = 
S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1006,6 +1029,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1030,6 +1054,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1103,6 +1128,7 @@ body: | ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = 
S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) @@ -1126,6 +1152,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1150,7 +1177,9 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) @@ -1174,7 +1203,9 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: 
(volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index df4193969f8a0..c5c07dd74db1c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -396,6 +396,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -411,6 +412,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; @@ -442,6 +444,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -454,6 +457,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 dlc +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; @@ -549,6 +553,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -565,6 +570,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; @@ -602,6 +608,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off dlc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -618,6 +625,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off dlc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir index 36a244f6250db..2f09fcb0b6b2c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir @@ -70,6 +70,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit 
$exec :: (volatile load syncscope("singlethread-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -97,7 +98,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -182,6 +185,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = 
V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -209,7 +213,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -294,6 +300,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -321,7 +328,9 @@ body: | ; GCN-NEXT: 
$sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -406,6 +415,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -433,7 +443,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: 
S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -518,6 +530,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -545,7 +558,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, 
implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -622,6 +637,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -646,6 +662,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -718,6 +735,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store 
syncscope("wavefront-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -742,6 +760,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -814,6 +833,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -838,6 +858,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -910,6 +931,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -934,6 +956,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1006,6 +1029,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) 
@@ -1030,6 +1054,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1103,6 +1128,7 @@ body: | ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) @@ -1126,6 +1152,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr 
addrspace(4) poison`, addrspace 4) @@ -1150,7 +1177,9 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) @@ -1174,7 +1203,9 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, align 8, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index 8426224d9dd50..51e9d67b0b77f 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ 
b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -335,8 +335,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W32-NEXT: global_store_dword v[9:10], v13, off -; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) +; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: global_store_dword v[11:12], v0, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] @@ -381,8 +381,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W64-NEXT: global_store_dword v[9:10], v13, off -; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) +; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: global_store_dword v[11:12], v0, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] @@ -430,8 +430,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W32-NEXT: global_store_b32 v[9:10], v13, off dlc -; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) +; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v0, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] @@ -479,8 +479,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W64-NEXT: global_store_b32 v[9:10], v13, off dlc -; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) +; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v0, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 1480743e435ff..0c77f290df1cd 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -346,8 +346,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W32-NEXT: global_store_dword v[9:10], v13, off -; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) +; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: global_store_dword v[11:12], v0, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] @@ -392,8 +392,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W64-NEXT: global_store_dword v[9:10], v13, off -; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) +; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: global_store_dword v[11:12], v0, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] @@ -441,8 +441,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W32-NEXT: global_store_b32 v[9:10], v13, off dlc -; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) +; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v0, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] @@ -490,8 +490,8 @@ 
define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W64-NEXT: global_store_b32 v[9:10], v13, off dlc -; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) +; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v0, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index d2394bab82c77..df6e1ec50fcd5 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -944,6 +944,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42 ; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 +; WAVE32-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23] @@ -1054,6 +1055,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 42 ; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 +; WAVE64-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; WAVE64-O0-NEXT: s_mov_b64 s[2:3], s[26:27] @@ -1165,6 +1167,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 +; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; 
WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23] @@ -1350,6 +1353,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_writelane_b32 v33, s16, 1 ; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; WAVE32-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-O0-NEXT: s_mov_b64 s[22:23], s[2:3] ; WAVE32-O0-NEXT: s_mov_b64 s[20:21], s[0:1] @@ -1461,6 +1465,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: v_writelane_b32 v33, s16, 1 ; WAVE64-O0-NEXT: v_mov_b32_e32 v0, 42 ; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; WAVE64-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-O0-NEXT: s_mov_b64 s[22:23], s[2:3] ; WAVE64-O0-NEXT: s_mov_b64 s[20:21], s[0:1] @@ -1572,6 +1577,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s16, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[22:23], s[2:3] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[20:21], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 69cc63eba6243..4c4cf4a273e7f 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -83,6 +83,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; 
HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2 ; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL) @@ -248,6 +249,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 3 ; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm ; HSA-TRAP-GFX1100-O0-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 @@ -382,6 +384,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:4 ; 4-byte Folded Reload ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm ; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2: @@ -482,10 +485,12 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_trap 3 ; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 2 ; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm store volatile i32 1, ptr addrspace(1) %arg0 diff --git 
a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir index acf8bd3a6ab56..8795f28cd4420 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir @@ -36,6 +36,7 @@ body: | ; GFX12-NEXT: S_WAIT_KMCNT_soft 0 ; GFX12-NEXT: S_WAIT_STORECNT_soft 0 ; GFX12-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 24, implicit $exec :: (volatile store (s32), addrspace 1) + ; GFX12-NEXT: S_WAIT_LOADCNT_soft 63 ; GFX12-NEXT: S_WAIT_STORECNT_soft 0 ; GFX12-NEXT: S_ENDPGM 0 GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)