@@ -338,8 +338,8 @@ class WaitcntBrackets {
338
338
const MachineOperand &Op) const ;
339
339
340
340
bool counterOutOfOrder (InstCounterType T) const ;
341
- void simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const ;
342
- void simplifyWaitcnt (InstCounterType T, unsigned &Count) const ;
341
+ void simplifyWaitcnt (AMDGPU::Waitcnt &Wait, bool OptNone ) const ;
342
+ void simplifyWaitcnt (InstCounterType T, unsigned &Count, bool OptNone ) const ;
343
343
344
344
void determineWait (InstCounterType T, RegInterval Interval,
345
345
AMDGPU::Waitcnt &Wait) const ;
@@ -1164,22 +1164,33 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
1164
1164
1165
1165
// / Simplify the waitcnt, in the sense of removing redundant counts, and return
1166
1166
// / whether a waitcnt instruction is needed at all.
1167
- void WaitcntBrackets::simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const {
1168
- simplifyWaitcnt (LOAD_CNT, Wait.LoadCnt );
1169
- simplifyWaitcnt (EXP_CNT, Wait.ExpCnt );
1170
- simplifyWaitcnt (DS_CNT, Wait.DsCnt );
1171
- simplifyWaitcnt (STORE_CNT, Wait.StoreCnt );
1172
- simplifyWaitcnt (SAMPLE_CNT, Wait.SampleCnt );
1173
- simplifyWaitcnt (BVH_CNT, Wait.BvhCnt );
1174
- simplifyWaitcnt (KM_CNT, Wait.KmCnt );
1175
- simplifyWaitcnt (X_CNT, Wait.XCnt );
1167
+ void WaitcntBrackets::simplifyWaitcnt (AMDGPU::Waitcnt &Wait,
1168
+ bool OptNone) const {
1169
+ simplifyWaitcnt (LOAD_CNT, Wait.LoadCnt , OptNone);
1170
+ simplifyWaitcnt (EXP_CNT, Wait.ExpCnt , OptNone);
1171
+ simplifyWaitcnt (DS_CNT, Wait.DsCnt , OptNone);
1172
+ simplifyWaitcnt (STORE_CNT, Wait.StoreCnt , OptNone);
1173
+ simplifyWaitcnt (SAMPLE_CNT, Wait.SampleCnt , OptNone);
1174
+ simplifyWaitcnt (BVH_CNT, Wait.BvhCnt , OptNone);
1175
+ simplifyWaitcnt (KM_CNT, Wait.KmCnt , OptNone);
1176
+ simplifyWaitcnt (X_CNT, Wait.XCnt , OptNone);
1176
1177
}
1177
1178
1178
- void WaitcntBrackets::simplifyWaitcnt (InstCounterType T,
1179
- unsigned &Count ) const {
1179
+ void WaitcntBrackets::simplifyWaitcnt (InstCounterType T, unsigned &Count,
1180
+ bool OptNone ) const {
1180
1181
// The number of outstanding events for this type, T, can be calculated
1181
1182
// as (UB - LB). If the current Count is greater than or equal to the number
1182
1183
// of outstanding events, then the wait for this counter is redundant.
1184
+ //
1185
+ // For counts that are at max value or above, try this even when optimizations
1186
+ // are disabled. This helps remove max waitcnt's that are inserted by the
1187
+ // memory legalizer by default, but does not optimize actual waitcnt's that
1188
+ // are otherwise inserted by the memory legalizer or a previous pass of the
1189
+ // inserter. The corner case is when a max waitcnt was optimized away although
1190
+ // it was not just a default, but was deliberately chosen. This only
1191
+ // marginally affects the usefulness of OptNone.
1192
+ if (Count < getWaitCountMax (T) && OptNone)
1193
+ return ;
1183
1194
if (Count >= getScoreRange (T))
1184
1195
Count = ~0u ;
1185
1196
}
@@ -1363,19 +1374,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1363
1374
}
1364
1375
1365
1376
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode (II.getOpcode ());
1366
- bool TrySimplify = Opcode != II.getOpcode () && !OptNone ;
1377
+ bool OpcodeIsSoft = Opcode != II.getOpcode ();
1367
1378
1368
1379
// Update required wait count. If this is a soft waitcnt (= it was added
1369
1380
// by an earlier pass), it may be entirely removed.
1370
1381
if (Opcode == AMDGPU::S_WAITCNT) {
1371
1382
unsigned IEnc = II.getOperand (0 ).getImm ();
1372
1383
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt (IV, IEnc);
1373
- if (TrySimplify )
1374
- ScoreBrackets.simplifyWaitcnt (OldWait);
1384
+ if (OpcodeIsSoft )
1385
+ ScoreBrackets.simplifyWaitcnt (OldWait, OptNone );
1375
1386
Wait = Wait.combined (OldWait);
1376
1387
1377
1388
// Merge consecutive waitcnt of the same type by erasing multiples.
1378
- if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt () && TrySimplify)) {
1389
+ if (WaitcntInstr ||
1390
+ (!Wait.hasWaitExceptStoreCnt () && OpcodeIsSoft && !OptNone)) {
1379
1391
II.eraseFromParent ();
1380
1392
Modified = true ;
1381
1393
} else
@@ -1386,11 +1398,13 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1386
1398
1387
1399
unsigned OldVSCnt =
1388
1400
TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
1389
- if (TrySimplify)
1390
- ScoreBrackets.simplifyWaitcnt (InstCounterType::STORE_CNT, OldVSCnt);
1401
+ if (OpcodeIsSoft)
1402
+ ScoreBrackets.simplifyWaitcnt (InstCounterType::STORE_CNT, OldVSCnt,
1403
+ OptNone);
1391
1404
Wait.StoreCnt = std::min (Wait.StoreCnt , OldVSCnt);
1392
1405
1393
- if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt () && TrySimplify)) {
1406
+ if (WaitcntVsCntInstr ||
1407
+ (!Wait.hasWaitStoreCnt () && OpcodeIsSoft && !OptNone)) {
1394
1408
II.eraseFromParent ();
1395
1409
Modified = true ;
1396
1410
} else
@@ -1528,7 +1542,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1528
1542
// by an earlier pass), it may be entirely removed.
1529
1543
1530
1544
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode (II.getOpcode ());
1531
- bool TrySimplify = Opcode != II.getOpcode () && !OptNone ;
1545
+ bool OpcodeIsSoft = Opcode != II.getOpcode ();
1532
1546
1533
1547
// Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1534
1548
// attempt to do more than that either.
@@ -1539,25 +1553,25 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1539
1553
unsigned OldEnc =
1540
1554
TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
1541
1555
AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt (IV, OldEnc);
1542
- if (TrySimplify )
1543
- ScoreBrackets.simplifyWaitcnt (OldWait);
1556
+ if (OpcodeIsSoft )
1557
+ ScoreBrackets.simplifyWaitcnt (OldWait, OptNone );
1544
1558
Wait = Wait.combined (OldWait);
1545
1559
UpdatableInstr = &CombinedLoadDsCntInstr;
1546
1560
} else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1547
1561
unsigned OldEnc =
1548
1562
TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
1549
1563
AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt (IV, OldEnc);
1550
- if (TrySimplify )
1551
- ScoreBrackets.simplifyWaitcnt (OldWait);
1564
+ if (OpcodeIsSoft )
1565
+ ScoreBrackets.simplifyWaitcnt (OldWait, OptNone );
1552
1566
Wait = Wait.combined (OldWait);
1553
1567
UpdatableInstr = &CombinedStoreDsCntInstr;
1554
1568
} else {
1555
1569
std::optional<InstCounterType> CT = counterTypeForInstr (Opcode);
1556
1570
assert (CT.has_value ());
1557
1571
unsigned OldCnt =
1558
1572
TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
1559
- if (TrySimplify )
1560
- ScoreBrackets.simplifyWaitcnt (CT.value (), OldCnt);
1573
+ if (OpcodeIsSoft )
1574
+ ScoreBrackets.simplifyWaitcnt (CT.value (), OldCnt, OptNone );
1561
1575
addWait (Wait, CT.value (), OldCnt);
1562
1576
UpdatableInstr = &WaitInstrs[CT.value ()];
1563
1577
}
@@ -2009,7 +2023,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
2009
2023
}
2010
2024
2011
2025
// Verify that the wait is actually needed.
2012
- ScoreBrackets.simplifyWaitcnt (Wait);
2026
+ ScoreBrackets.simplifyWaitcnt (Wait, /* OptNone = */ false );
2013
2027
2014
2028
// When forcing emit, we need to skip terminators because that would break the
2015
2029
// terminators of the MBB if we emit a waitcnt between terminators.
@@ -2238,7 +2252,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2238
2252
NeedsEndPGMCheck = true ;
2239
2253
}
2240
2254
2241
- ScoreBrackets.simplifyWaitcnt (Wait);
2255
+ ScoreBrackets.simplifyWaitcnt (Wait, /* OptNone = */ false );
2242
2256
2243
2257
auto SuccessorIt = std::next (Inst.getIterator ());
2244
2258
bool Result = generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
0 commit comments