Skip to content

Commit e6831c6

Browse files
committed
[AMDGPU] optimize trivial soft wait counts
1 parent 35911ce commit e6831c6

24 files changed

+521
-1889
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -338,8 +338,8 @@ class WaitcntBrackets {
338338
const MachineOperand &Op) const;
339339

340340
bool counterOutOfOrder(InstCounterType T) const;
341-
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
342-
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
341+
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, bool OptNone) const;
342+
void simplifyWaitcnt(InstCounterType T, unsigned &Count, bool OptNone) const;
343343

344344
void determineWait(InstCounterType T, RegInterval Interval,
345345
AMDGPU::Waitcnt &Wait) const;
@@ -1164,22 +1164,33 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
11641164

11651165
/// Simplify the waitcnt in place, in the sense of removing redundant counts:
11661166
/// each count that is already satisfied is reset to ~0u. Returns nothing.
1167-
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1168-
simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1169-
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1170-
simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1171-
simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1172-
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1173-
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1174-
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1175-
simplifyWaitcnt(X_CNT, Wait.XCnt);
1167+
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
1168+
bool OptNone) const {
1169+
simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt, OptNone);
1170+
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt, OptNone);
1171+
simplifyWaitcnt(DS_CNT, Wait.DsCnt, OptNone);
1172+
simplifyWaitcnt(STORE_CNT, Wait.StoreCnt, OptNone);
1173+
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt, OptNone);
1174+
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt, OptNone);
1175+
simplifyWaitcnt(KM_CNT, Wait.KmCnt, OptNone);
1176+
simplifyWaitcnt(X_CNT, Wait.XCnt, OptNone);
11761177
}
11771178

1178-
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1179-
unsigned &Count) const {
1179+
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count,
1180+
bool OptNone) const {
11801181
// The number of outstanding events for this type, T, can be calculated
11811182
// as (UB - LB). If the current Count is greater than or equal to the number
11821183
// of outstanding events, then the wait for this counter is redundant.
1184+
//
1185+
// For counts that are at max value or above, try this even when optimizations
1186+
// are disabled. This helps remove max waitcnts that are inserted by the
1187+
// memory legalizer by default, but does not optimize actual waitcnts that
1188+
// are otherwise inserted by the memory legalizer or a previous pass of the
1189+
// inserter. The corner case is a max waitcnt that gets optimized away even
1190+
// though it was not just a default but was deliberately chosen. This only
1191+
// marginally affects the usefulness of OptNone.
1192+
if (Count < getWaitCountMax(T) && OptNone)
1193+
return;
11831194
if (Count >= getScoreRange(T))
11841195
Count = ~0u;
11851196
}
@@ -1363,19 +1374,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13631374
}
13641375

13651376
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1366-
bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1377+
bool OpcodeIsSoft = Opcode != II.getOpcode();
13671378

13681379
// Update required wait count. If this is a soft waitcnt (= it was added
13691380
// by an earlier pass), it may be entirely removed.
13701381
if (Opcode == AMDGPU::S_WAITCNT) {
13711382
unsigned IEnc = II.getOperand(0).getImm();
13721383
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1373-
if (TrySimplify)
1374-
ScoreBrackets.simplifyWaitcnt(OldWait);
1384+
if (OpcodeIsSoft)
1385+
ScoreBrackets.simplifyWaitcnt(OldWait, OptNone);
13751386
Wait = Wait.combined(OldWait);
13761387

13771388
// Merge consecutive waitcnt of the same type by erasing multiples.
1378-
if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1389+
if (WaitcntInstr ||
1390+
(!Wait.hasWaitExceptStoreCnt() && OpcodeIsSoft && !OptNone)) {
13791391
II.eraseFromParent();
13801392
Modified = true;
13811393
} else
@@ -1386,11 +1398,13 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13861398

13871399
unsigned OldVSCnt =
13881400
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1389-
if (TrySimplify)
1390-
ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1401+
if (OpcodeIsSoft)
1402+
ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt,
1403+
OptNone);
13911404
Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
13921405

1393-
if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1406+
if (WaitcntVsCntInstr ||
1407+
(!Wait.hasWaitStoreCnt() && OpcodeIsSoft && !OptNone)) {
13941408
II.eraseFromParent();
13951409
Modified = true;
13961410
} else
@@ -1528,7 +1542,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
15281542
// by an earlier pass), it may be entirely removed.
15291543

15301544
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1531-
bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1545+
bool OpcodeIsSoft = Opcode != II.getOpcode();
15321546

15331547
// Don't crash if the programmer used legacy waitcnt intrinsics, but don't
15341548
// attempt to do more than that either.
@@ -1539,25 +1553,25 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
15391553
unsigned OldEnc =
15401554
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
15411555
AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1542-
if (TrySimplify)
1543-
ScoreBrackets.simplifyWaitcnt(OldWait);
1556+
if (OpcodeIsSoft)
1557+
ScoreBrackets.simplifyWaitcnt(OldWait, OptNone);
15441558
Wait = Wait.combined(OldWait);
15451559
UpdatableInstr = &CombinedLoadDsCntInstr;
15461560
} else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
15471561
unsigned OldEnc =
15481562
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
15491563
AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1550-
if (TrySimplify)
1551-
ScoreBrackets.simplifyWaitcnt(OldWait);
1564+
if (OpcodeIsSoft)
1565+
ScoreBrackets.simplifyWaitcnt(OldWait, OptNone);
15521566
Wait = Wait.combined(OldWait);
15531567
UpdatableInstr = &CombinedStoreDsCntInstr;
15541568
} else {
15551569
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
15561570
assert(CT.has_value());
15571571
unsigned OldCnt =
15581572
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1559-
if (TrySimplify)
1560-
ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1573+
if (OpcodeIsSoft)
1574+
ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt, OptNone);
15611575
addWait(Wait, CT.value(), OldCnt);
15621576
UpdatableInstr = &WaitInstrs[CT.value()];
15631577
}
@@ -2009,7 +2023,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20092023
}
20102024

20112025
// Verify that the wait is actually needed.
2012-
ScoreBrackets.simplifyWaitcnt(Wait);
2026+
ScoreBrackets.simplifyWaitcnt(Wait, /* OptNone = */ false);
20132027

20142028
// When forcing emit, we need to skip terminators because that would break the
20152029
// terminators of the MBB if we emit a waitcnt between terminators.
@@ -2238,7 +2252,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
22382252
NeedsEndPGMCheck = true;
22392253
}
22402254

2241-
ScoreBrackets.simplifyWaitcnt(Wait);
2255+
ScoreBrackets.simplifyWaitcnt(Wait, /* OptNone = */ false);
22422256

22432257
auto SuccessorIt = std::next(Inst.getIterator());
22442258
bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,

llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
8484
;
8585
; GFX12-CU-LABEL: workgroup_acquire_fence:
8686
; GFX12-CU: ; %bb.0: ; %entry
87-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
8887
; GFX12-CU-NEXT: s_endpgm
8988
entry:
9089
fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-as", !"global"}
@@ -159,7 +158,6 @@ define amdgpu_kernel void @workgroup_release_fence() {
159158
;
160159
; GFX12-CU-LABEL: workgroup_release_fence:
161160
; GFX12-CU: ; %bb.0: ; %entry
162-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
163161
; GFX12-CU-NEXT: s_endpgm
164162
entry:
165163
fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"global"}
@@ -239,7 +237,6 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
239237
;
240238
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
241239
; GFX12-CU: ; %bb.0: ; %entry
242-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
243240
; GFX12-CU-NEXT: s_endpgm
244241
entry:
245242
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-as", !"global"}
@@ -319,7 +316,6 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
319316
;
320317
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
321318
; GFX12-CU: ; %bb.0: ; %entry
322-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
323319
; GFX12-CU-NEXT: s_endpgm
324320
entry:
325321
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-as", !"global"}
@@ -397,7 +393,6 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
397393
;
398394
; GFX12-CU-LABEL: workgroup_one_as_acquire_fence:
399395
; GFX12-CU: ; %bb.0: ; %entry
400-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
401396
; GFX12-CU-NEXT: s_endpgm
402397
entry:
403398
fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-as", !"global"}
@@ -472,7 +467,6 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
472467
;
473468
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
474469
; GFX12-CU: ; %bb.0: ; %entry
475-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
476470
; GFX12-CU-NEXT: s_endpgm
477471
entry:
478472
fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-as", !"global"}
@@ -552,7 +546,6 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
552546
;
553547
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
554548
; GFX12-CU: ; %bb.0: ; %entry
555-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
556549
; GFX12-CU-NEXT: s_endpgm
557550
entry:
558551
fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-as", !"global"}
@@ -632,7 +625,6 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
632625
;
633626
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
634627
; GFX12-CU: ; %bb.0: ; %entry
635-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
636628
; GFX12-CU-NEXT: s_endpgm
637629
entry:
638630
fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-as", !"global"}

llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
7171
;
7272
; GFX12-WGP-LABEL: workgroup_acquire_fence:
7373
; GFX12-WGP: ; %bb.0: ; %entry
74-
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x3f00
74+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
7575
; GFX12-WGP-NEXT: s_endpgm
7676
;
7777
; GFX12-CU-LABEL: workgroup_acquire_fence:
7878
; GFX12-CU: ; %bb.0: ; %entry
79-
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x3f00
79+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
8080
; GFX12-CU-NEXT: s_endpgm
8181
entry:
8282
fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-as", !"local"}
@@ -345,12 +345,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
345345
;
346346
; GFX12-WGP-LABEL: workgroup_one_as_acquire_fence:
347347
; GFX12-WGP: ; %bb.0: ; %entry
348-
; GFX12-WGP-NEXT: s_wait_loadcnt 0x3f
349348
; GFX12-WGP-NEXT: s_endpgm
350349
;
351350
; GFX12-CU-LABEL: workgroup_one_as_acquire_fence:
352351
; GFX12-CU: ; %bb.0: ; %entry
353-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
354352
; GFX12-CU-NEXT: s_endpgm
355353
entry:
356354
fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-as", !"local"}
@@ -619,12 +617,12 @@ define amdgpu_kernel void @agent_acquire_fence() {
619617
;
620618
; GFX12-WGP-LABEL: agent_acquire_fence:
621619
; GFX12-WGP: ; %bb.0: ; %entry
622-
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x3f00
620+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
623621
; GFX12-WGP-NEXT: s_endpgm
624622
;
625623
; GFX12-CU-LABEL: agent_acquire_fence:
626624
; GFX12-CU: ; %bb.0: ; %entry
627-
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x3f00
625+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
628626
; GFX12-CU-NEXT: s_endpgm
629627
entry:
630628
fence syncscope("agent") acquire, !mmra !{!"amdgpu-as", !"local"}
@@ -893,12 +891,10 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
893891
;
894892
; GFX12-WGP-LABEL: agent_one_as_acquire_fence:
895893
; GFX12-WGP: ; %bb.0: ; %entry
896-
; GFX12-WGP-NEXT: s_wait_loadcnt 0x3f
897894
; GFX12-WGP-NEXT: s_endpgm
898895
;
899896
; GFX12-CU-LABEL: agent_one_as_acquire_fence:
900897
; GFX12-CU: ; %bb.0: ; %entry
901-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
902898
; GFX12-CU-NEXT: s_endpgm
903899
entry:
904900
fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-as", !"local"}
@@ -1167,12 +1163,12 @@ define amdgpu_kernel void @system_acquire_fence() {
11671163
;
11681164
; GFX12-WGP-LABEL: system_acquire_fence:
11691165
; GFX12-WGP: ; %bb.0: ; %entry
1170-
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x3f00
1166+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
11711167
; GFX12-WGP-NEXT: s_endpgm
11721168
;
11731169
; GFX12-CU-LABEL: system_acquire_fence:
11741170
; GFX12-CU: ; %bb.0: ; %entry
1175-
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x3f00
1171+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
11761172
; GFX12-CU-NEXT: s_endpgm
11771173
entry:
11781174
fence acquire, !mmra !{!"amdgpu-as", !"local"}
@@ -1441,12 +1437,10 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
14411437
;
14421438
; GFX12-WGP-LABEL: system_one_as_acquire_fence:
14431439
; GFX12-WGP: ; %bb.0: ; %entry
1444-
; GFX12-WGP-NEXT: s_wait_loadcnt 0x3f
14451440
; GFX12-WGP-NEXT: s_endpgm
14461441
;
14471442
; GFX12-CU-LABEL: system_one_as_acquire_fence:
14481443
; GFX12-CU: ; %bb.0: ; %entry
1449-
; GFX12-CU-NEXT: s_wait_loadcnt 0x3f
14501444
; GFX12-CU-NEXT: s_endpgm
14511445
entry:
14521446
fence syncscope("one-as") acquire, !mmra !{!"amdgpu-as", !"local"}

0 commit comments

Comments
 (0)