@@ -104,25 +104,38 @@ struct HardwareLimits {
104
104
unsigned KmcntMax; // gfx12+ only.
105
105
};
106
106
107
+ #define AMDGPU_DECLARE_WAIT_EVENTS (DECL ) \
108
+ DECL (VMEM_ACCESS) /* vmem read & write */ \
109
+ DECL (VMEM_READ_ACCESS) /* vmem read */ \
110
+ DECL (VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
111
+ DECL (VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
112
+ DECL (VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
113
+ DECL (SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
114
+ DECL (LDS_ACCESS) /* lds read & write */ \
115
+ DECL (GDS_ACCESS) /* gds read & write */ \
116
+ DECL (SQ_MESSAGE) /* send message */ \
117
+ DECL (SMEM_ACCESS) /* scalar-memory read & write */ \
118
+ DECL (EXP_GPR_LOCK) /* export holding on its data src */ \
119
+ DECL (GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
120
+ DECL (EXP_POS_ACCESS) /* write to export position */ \
121
+ DECL (EXP_PARAM_ACCESS) /* write to export parameter */ \
122
+ DECL (VMW_GPR_LOCK) /* vmem write holding on its data src */ \
123
+ DECL (EXP_LDS_ACCESS) /* read by ldsdir counting as export */
124
+
125
+ // clang-format off
126
+ #define AMDGPU_EVENT_ENUM (Name ) Name,
107
127
enum WaitEventType {
108
- VMEM_ACCESS, // vector-memory read & write
109
- VMEM_READ_ACCESS, // vector-memory read
110
- VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
111
- VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
112
- VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
113
- SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
114
- LDS_ACCESS, // lds read & write
115
- GDS_ACCESS, // gds read & write
116
- SQ_MESSAGE, // send message
117
- SMEM_ACCESS, // scalar-memory read & write
118
- EXP_GPR_LOCK, // export holding on its data src
119
- GDS_GPR_LOCK, // GDS holding on its data and addr src
120
- EXP_POS_ACCESS, // write to export position
121
- EXP_PARAM_ACCESS, // write to export parameter
122
- VMW_GPR_LOCK, // vector-memory write holding on its data src
123
- EXP_LDS_ACCESS, // read by ldsdir counting as export
124
- NUM_WAIT_EVENTS,
128
+ AMDGPU_DECLARE_WAIT_EVENTS (AMDGPU_EVENT_ENUM)
129
+ NUM_WAIT_EVENTS
125
130
};
131
+ #undef AMDGPU_EVENT_ENUM
132
+
133
+ #define AMDGPU_EVENT_NAME (Name ) #Name,
134
+ static constexpr StringLiteral WaitEventTypeName[] = {
135
+ AMDGPU_DECLARE_WAIT_EVENTS (AMDGPU_EVENT_NAME)
136
+ };
137
+ #undef AMDGPU_EVENT_NAME
138
+ // clang-format on
126
139
127
140
// The mapping is:
128
141
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
@@ -1100,6 +1113,20 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
1100
1113
}
1101
1114
OS << ' \n ' ;
1102
1115
}
1116
+
1117
+ OS << " Pending Events: " ;
1118
+ if (hasPendingEvent ()) {
1119
+ ListSeparator LS;
1120
+ for (unsigned I = 0 ; I != NUM_WAIT_EVENTS; ++I) {
1121
+ if (hasPendingEvent ((WaitEventType)I)) {
1122
+ OS << LS << WaitEventTypeName[I];
1123
+ }
1124
+ }
1125
+ } else {
1126
+ OS << " none" ;
1127
+ }
1128
+ OS << ' \n ' ;
1129
+
1103
1130
OS << ' \n ' ;
1104
1131
}
1105
1132
@@ -1265,10 +1292,15 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1265
1292
MachineInstr *WaitcntInstr = nullptr ;
1266
1293
MachineInstr *WaitcntVsCntInstr = nullptr ;
1267
1294
1295
+ LLVM_DEBUG (dbgs () << " PreGFX12::applyPreexistingWaitcnt at: " << *It);
1296
+
1268
1297
for (auto &II :
1269
1298
make_early_inc_range (make_range (OldWaitcntInstr.getIterator (), It))) {
1270
- if (II.isMetaInstruction ())
1299
+ LLVM_DEBUG (dbgs () << " pre-existing iter: " << II);
1300
+ if (II.isMetaInstruction ()) {
1301
+ LLVM_DEBUG (dbgs () << " skipped meta instruction\n " );
1271
1302
continue ;
1303
+ }
1272
1304
1273
1305
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode (II.getOpcode ());
1274
1306
bool TrySimplify = Opcode != II.getOpcode () && !OptNone;
@@ -1320,9 +1352,9 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1320
1352
1321
1353
LLVM_DEBUG (It == WaitcntInstr->getParent ()->end ()
1322
1354
? dbgs ()
1323
- << " applyPreexistingWaitcnt \n "
1355
+ << " applied pre-existing waitcnt \n "
1324
1356
<< " New Instr at block end: " << *WaitcntInstr << ' \n '
1325
- : dbgs () << " applyPreexistingWaitcnt \n "
1357
+ : dbgs () << " applied pre-existing waitcnt \n "
1326
1358
<< " Old Instr: " << *It
1327
1359
<< " New Instr: " << *WaitcntInstr << ' \n ' );
1328
1360
}
@@ -1336,10 +1368,10 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1336
1368
Wait.StoreCnt = ~0u ;
1337
1369
1338
1370
LLVM_DEBUG (It == WaitcntVsCntInstr->getParent ()->end ()
1339
- ? dbgs () << " applyPreexistingWaitcnt \n "
1371
+ ? dbgs () << " applied pre-existing waitcnt \n "
1340
1372
<< " New Instr at block end: " << *WaitcntVsCntInstr
1341
1373
<< ' \n '
1342
- : dbgs () << " applyPreexistingWaitcnt \n "
1374
+ : dbgs () << " applied pre-existing waitcnt \n "
1343
1375
<< " Old Instr: " << *It
1344
1376
<< " New Instr: " << *WaitcntVsCntInstr << ' \n ' );
1345
1377
}
@@ -1413,10 +1445,15 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1413
1445
MachineInstr *CombinedStoreDsCntInstr = nullptr ;
1414
1446
MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1415
1447
1448
+ LLVM_DEBUG (dbgs () << " GFX12Plus::applyPreexistingWaitcnt at: " << *It);
1449
+
1416
1450
for (auto &II :
1417
1451
make_early_inc_range (make_range (OldWaitcntInstr.getIterator (), It))) {
1418
- if (II.isMetaInstruction ())
1452
+ LLVM_DEBUG (dbgs () << " pre-existing iter: " << II);
1453
+ if (II.isMetaInstruction ()) {
1454
+ LLVM_DEBUG (dbgs () << " skipped meta instruction\n " );
1419
1455
continue ;
1456
+ }
1420
1457
1421
1458
MachineInstr **UpdatableInstr;
1422
1459
@@ -1486,10 +1523,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1486
1523
Wait.DsCnt = ~0u ;
1487
1524
1488
1525
LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
1489
- ? dbgs () << " applyPreexistingWaitcnt \n "
1526
+ ? dbgs () << " applied pre-existing waitcnt \n "
1490
1527
<< " New Instr at block end: "
1491
1528
<< *CombinedLoadDsCntInstr << ' \n '
1492
- : dbgs () << " applyPreexistingWaitcnt \n "
1529
+ : dbgs () << " applied pre-existing waitcnt \n "
1493
1530
<< " Old Instr: " << *It << " New Instr: "
1494
1531
<< *CombinedLoadDsCntInstr << ' \n ' );
1495
1532
} else {
@@ -1511,10 +1548,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1511
1548
Wait.DsCnt = ~0u ;
1512
1549
1513
1550
LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
1514
- ? dbgs () << " applyPreexistingWaitcnt \n "
1551
+ ? dbgs () << " applied pre-existing waitcnt \n "
1515
1552
<< " New Instr at block end: "
1516
1553
<< *CombinedStoreDsCntInstr << ' \n '
1517
- : dbgs () << " applyPreexistingWaitcnt \n "
1554
+ : dbgs () << " applied pre-existing waitcnt \n "
1518
1555
<< " Old Instr: " << *It << " New Instr: "
1519
1556
<< *CombinedStoreDsCntInstr << ' \n ' );
1520
1557
} else {
@@ -1570,10 +1607,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1570
1607
setNoWait (Wait, CT);
1571
1608
1572
1609
LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
1573
- ? dbgs () << " applyPreexistingWaitcnt \n "
1610
+ ? dbgs () << " applied pre-existing waitcnt \n "
1574
1611
<< " New Instr at block end: " << *WaitInstrs[CT]
1575
1612
<< ' \n '
1576
- : dbgs () << " applyPreexistingWaitcnt \n "
1613
+ : dbgs () << " applied pre-existing waitcnt \n "
1577
1614
<< " Old Instr: " << *It
1578
1615
<< " New Instr: " << *WaitInstrs[CT] << ' \n ' );
1579
1616
} else {
@@ -2306,7 +2343,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2306
2343
bool Modified = false ;
2307
2344
2308
2345
LLVM_DEBUG ({
2309
- dbgs () << " *** Block" << Block.getNumber () << " ***" ;
2346
+ dbgs () << " *** Begin Block: " ;
2347
+ Block.printName (dbgs ());
2310
2348
ScoreBrackets.dump ();
2311
2349
});
2312
2350
@@ -2437,6 +2475,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2437
2475
Modified |= generateWaitcnt (Wait, Block.instr_end (), Block, ScoreBrackets,
2438
2476
OldWaitcntInstr);
2439
2477
2478
+ LLVM_DEBUG ({
2479
+ dbgs () << " *** End Block: " ;
2480
+ Block.printName (dbgs ());
2481
+ ScoreBrackets.dump ();
2482
+ });
2483
+
2440
2484
return Modified;
2441
2485
}
2442
2486
@@ -2699,17 +2743,21 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
2699
2743
BlockInfo &SuccBI = SuccBII->second ;
2700
2744
if (!SuccBI.Incoming ) {
2701
2745
SuccBI.Dirty = true ;
2702
- if (SuccBII <= BII)
2746
+ if (SuccBII <= BII) {
2747
+ LLVM_DEBUG (dbgs () << " repeat on backedge\n " );
2703
2748
Repeat = true ;
2749
+ }
2704
2750
if (!MoveBracketsToSucc) {
2705
2751
MoveBracketsToSucc = &SuccBI;
2706
2752
} else {
2707
2753
SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2708
2754
}
2709
2755
} else if (SuccBI.Incoming ->merge (*Brackets)) {
2710
2756
SuccBI.Dirty = true ;
2711
- if (SuccBII <= BII)
2757
+ if (SuccBII <= BII) {
2758
+ LLVM_DEBUG (dbgs () << " repeat on backedge\n " );
2712
2759
Repeat = true ;
2760
+ }
2713
2761
}
2714
2762
}
2715
2763
if (MoveBracketsToSucc)
0 commit comments