Skip to content

Commit c0b82df

Browse files
authored
[MachinePipeliner] Add validation for missed loop-carried memory deps (#145878)
This patch adds an additional validation step to ensure that the generated schedule does not violate loop-carried memory dependencies. Prior to this patch, incorrect schedules could be produced due to the lack of checks for the following types of dependencies: - load-to-store backward (from bottom to top within the BB) dependencies - store-to-load dependencies - store-to-store dependencies One possible solution to this issue is to add these dependencies directly to the dependency graph, although doing so may lead to performance degradation. In addition, no known cases of incorrect code generation caused by these missing dependencies have been observed in practice. Given these factors, this patch introduces a post-scheduling validation phase to check for such previously missed dependencies, instead of adding them to the graph before searching for a schedule. Since no actual problems have been identified so far, it is likely that most generated schedules are already valid. Therefore, this additional validation is not expected to cause performance degradation in practice. Split off from #135148 . The remaining tasks are as follows: - Address other missing loop-carried dependencies (e.g., output dependencies between physical registers, barrier instructions, and instructions that may raise floating-point exceptions) - Remove code that are currently retained to maintain the existing behavior but probably unnecessary. - Eliminate `SwingSchedulerDAG::isLoopCarriedDep` and use `SwingSchedulerDDG` to traverse edges after dependency analysis part.
1 parent 6fc3b40 commit c0b82df

File tree

7 files changed

+204
-86
lines changed

7 files changed

+204
-86
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -120,14 +120,17 @@ class SwingSchedulerDDGEdge {
120120
SUnit *Dst = nullptr;
121121
SDep Pred;
122122
unsigned Distance = 0;
123+
bool IsValidationOnly = false;
123124

124125
public:
125126
/// Creates an edge corresponding to an edge represented by \p PredOrSucc and
126127
/// \p Dep in the original DAG. This pair has no information about the
127128
/// direction of the edge, so we need to pass an additional argument \p
128129
/// IsSucc.
129-
SwingSchedulerDDGEdge(SUnit *PredOrSucc, const SDep &Dep, bool IsSucc)
130-
: Dst(PredOrSucc), Pred(Dep), Distance(0u) {
130+
SwingSchedulerDDGEdge(SUnit *PredOrSucc, const SDep &Dep, bool IsSucc,
131+
bool IsValidationOnly)
132+
: Dst(PredOrSucc), Pred(Dep), Distance(0u),
133+
IsValidationOnly(IsValidationOnly) {
131134
SUnit *Src = Dep.getSUnit();
132135

133136
if (IsSucc) {
@@ -188,6 +191,10 @@ class SwingSchedulerDDGEdge {
188191
/// functions. We ignore the back-edge recurrence in order to avoid unbounded
189192
/// recursion in the calculation of the ASAP, ALAP, etc functions.
190193
bool ignoreDependence(bool IgnoreAnti) const;
194+
195+
/// Returns true if this edge is intended to be used only for validating the
196+
/// schedule.
197+
bool isValidationOnly() const { return IsValidationOnly; }
191198
};
192199

193200
/// Represents loop-carried dependencies. Because SwingSchedulerDAG doesn't
@@ -208,25 +215,21 @@ struct LoopCarriedEdges {
208215
return &Ite->second;
209216
}
210217

211-
/// Retruns true if the edge from \p From to \p To is a back-edge that should
212-
/// be used when scheduling.
213-
bool shouldUseWhenScheduling(const SUnit *From, const SUnit *To) const;
214-
215218
/// Adds some edges to the original DAG that correspond to loop-carried
216219
/// dependencies. Historically, loop-carried edges are represented by using
217220
/// non-loop-carried edges in the original DAG. This function appends such
218221
/// edges to preserve the previous behavior.
219-
void modifySUnits(std::vector<SUnit> &SUnits);
222+
void modifySUnits(std::vector<SUnit> &SUnits, const TargetInstrInfo *TII);
220223

221224
void dump(SUnit *SU, const TargetRegisterInfo *TRI,
222225
const MachineRegisterInfo *MRI) const;
223226
};
224227

225-
/// Represents dependencies between instructions. This class is a wrapper of
226-
/// `SUnits` and its dependencies to manipulate back-edges in a natural way.
227-
/// Currently it only supports back-edges via PHI, which are expressed as
228-
/// anti-dependencies in the original DAG.
229-
/// FIXME: Support any other loop-carried dependencies
228+
/// This class provides APIs to retrieve edges from/to an SUnit node, with a
229+
/// particular focus on loop-carried dependencies. Since SUnit is not designed
230+
/// to represent such edges, handling them directly using its APIs has required
231+
/// non-trivial logic in the past. This class serves as a wrapper around SUnit,
232+
/// offering a simpler interface for managing these dependencies.
230233
class SwingSchedulerDDG {
231234
using EdgesType = SmallVector<SwingSchedulerDDGEdge, 4>;
232235

@@ -244,17 +247,26 @@ class SwingSchedulerDDG {
244247
SwingSchedulerDDGEdges EntrySUEdges;
245248
SwingSchedulerDDGEdges ExitSUEdges;
246249

250+
/// Edges that are used only when validating the schedule. These edges are
251+
/// not considered to drive the optimization heuristics.
252+
SmallVector<SwingSchedulerDDGEdge, 8> ValidationOnlyEdges;
253+
254+
/// Adds a NON-validation-only edge to the DDG. Assumes to be called only by
255+
/// the ctor.
247256
void addEdge(const SUnit *SU, const SwingSchedulerDDGEdge &Edge);
248257

249258
SwingSchedulerDDGEdges &getEdges(const SUnit *SU);
250259
const SwingSchedulerDDGEdges &getEdges(const SUnit *SU) const;
251260

252261
public:
253-
SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU);
262+
SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU,
263+
const LoopCarriedEdges &LCE);
254264

255265
const EdgesType &getInEdges(const SUnit *SU) const;
256266

257267
const EdgesType &getOutEdges(const SUnit *SU) const;
268+
269+
bool isValidSchedule(const SMSchedule &Schedule) const;
258270
};
259271

260272
/// This class builds the dependence graph for the instructions in a loop,

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 154 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,17 @@ class LoopCarriedOrderDepsTracker {
338338
void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
339339
const LoadStoreChunk &To);
340340

341+
/// Add a loop-carried order dependency between \p Src and \p Dst if we
342+
/// cannot prove they are independent. When \p PerformCheapCheck is true, a
343+
/// lightweight dependency test (referred to as "cheap check" below) is
344+
/// performed at first. Note that the cheap check is retained to maintain the
345+
/// existing behavior and not expected to be used anymore.
346+
///
347+
/// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
348+
void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
349+
const SUnitWithMemInfo &Dst,
350+
bool PerformCheapCheck = false);
351+
341352
void computeDependenciesAux();
342353
};
343354

@@ -673,7 +684,7 @@ void SwingSchedulerDAG::schedule() {
673684
Topo.InitDAGTopologicalSorting();
674685
changeDependences();
675686
postProcessDAG();
676-
DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
687+
DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
677688
LLVM_DEBUG({
678689
dump();
679690
dbgs() << "===== Loop Carried Edges Begin =====\n";
@@ -958,36 +969,44 @@ bool SUnitWithMemInfo::getUnderlyingObjects() {
958969

959970
/// Returns true if there is a loop-carried order dependency from \p Src to \p
960971
/// Dst.
961-
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
962-
const SUnitWithMemInfo &Dst,
963-
BatchAAResults &BAA,
964-
const TargetInstrInfo *TII,
965-
const TargetRegisterInfo *TRI) {
972+
static bool
973+
hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
974+
BatchAAResults &BAA, const TargetInstrInfo *TII,
975+
const TargetRegisterInfo *TRI,
976+
const SwingSchedulerDAG *SSD, bool PerformCheapCheck) {
966977
if (Src.isTriviallyDisjoint(Dst))
967978
return false;
968979
if (isSuccOrder(Src.SU, Dst.SU))
969980
return false;
970981

971982
MachineInstr &SrcMI = *Src.SU->getInstr();
972983
MachineInstr &DstMI = *Dst.SU->getInstr();
973-
// First, perform the cheaper check that compares the base register.
974-
// If they are the same and the load offset is less than the store
975-
// offset, then mark the dependence as loop carried potentially.
976-
const MachineOperand *BaseOp1, *BaseOp2;
977-
int64_t Offset1, Offset2;
978-
bool Offset1IsScalable, Offset2IsScalable;
979-
if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
980-
TRI) &&
981-
TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
982-
TRI)) {
983-
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
984-
Offset1IsScalable == Offset2IsScalable && (int)Offset1 < (int)Offset2) {
985-
assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
986-
"What happened to the chain edge?");
987-
return true;
984+
if (PerformCheapCheck) {
985+
// First, perform the cheaper check that compares the base register.
986+
// If they are the same and the load offset is less than the store
987+
// offset, then mark the dependence as loop carried potentially.
988+
//
989+
// TODO: This check will be removed.
990+
const MachineOperand *BaseOp1, *BaseOp2;
991+
int64_t Offset1, Offset2;
992+
bool Offset1IsScalable, Offset2IsScalable;
993+
if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
994+
TRI) &&
995+
TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
996+
TRI)) {
997+
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
998+
Offset1IsScalable == Offset2IsScalable &&
999+
(int)Offset1 < (int)Offset2) {
1000+
assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
1001+
"What happened to the chain edge?");
1002+
return true;
1003+
}
9881004
}
9891005
}
9901006

1007+
if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
1008+
return false;
1009+
9911010
// Second, the more expensive check that uses alias analysis on the
9921011
// base registers. If they alias, and the load offset is less than
9931012
// the store offset, the mark the dependence as loop carried.
@@ -1056,20 +1075,34 @@ LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
10561075
return std::nullopt;
10571076
}
10581077

1078+
void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1079+
const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
1080+
bool PerformCheapCheck) {
1081+
// Avoid self-dependencies.
1082+
if (Src.SU == Dst.SU)
1083+
return;
1084+
1085+
if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck))
1086+
LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
1087+
}
1088+
10591089
void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
10601090
const LoadStoreChunk &From, const LoadStoreChunk &To) {
1061-
// Add dependencies for load-to-store (WAR) from top to bottom.
1091+
// Add load-to-store dependencies (WAR).
10621092
for (const SUnitWithMemInfo &Src : From.Loads)
10631093
for (const SUnitWithMemInfo &Dst : To.Stores)
1064-
if (Src.SU->NodeNum < Dst.SU->NodeNum &&
1065-
hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
1066-
LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
1094+
// Perform a cheap check first if this is a forward dependency.
1095+
addDependenciesBetweenSUs(Src, Dst, Src.SU->NodeNum < Dst.SU->NodeNum);
10671096

1068-
// TODO: The following dependencies are missed.
1069-
//
1070-
// - Dependencies for load-to-store from bottom to top.
1071-
// - Dependencies for store-to-load (RAW).
1072-
// - Dependencies for store-to-store (WAW).
1097+
// Add store-to-load dependencies (RAW).
1098+
for (const SUnitWithMemInfo &Src : From.Stores)
1099+
for (const SUnitWithMemInfo &Dst : To.Loads)
1100+
addDependenciesBetweenSUs(Src, Dst);
1101+
1102+
// Add store-to-store dependencies (WAW).
1103+
for (const SUnitWithMemInfo &Src : From.Stores)
1104+
for (const SUnitWithMemInfo &Dst : To.Stores)
1105+
addDependenciesBetweenSUs(Src, Dst);
10731106
}
10741107

10751108
void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
@@ -1116,7 +1149,7 @@ LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
11161149
for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
11171150
LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
11181151

1119-
LCE.modifySUnits(SUnits);
1152+
LCE.modifySUnits(SUnits, TII);
11201153
return LCE;
11211154
}
11221155

@@ -2676,6 +2709,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
26762709
});
26772710
} while (++NI != NE && scheduleFound);
26782711

2712+
// If a schedule is found, validate it against the validation-only
2713+
// dependencies.
2714+
if (scheduleFound)
2715+
scheduleFound = DDG->isValidSchedule(Schedule);
2716+
26792717
// If a schedule is found, ensure non-pipelined instructions are in stage 0
26802718
if (scheduleFound)
26812719
scheduleFound =
@@ -4118,6 +4156,8 @@ SwingSchedulerDDG::getEdges(const SUnit *SU) const {
41184156

41194157
void SwingSchedulerDDG::addEdge(const SUnit *SU,
41204158
const SwingSchedulerDDGEdge &Edge) {
4159+
assert(!Edge.isValidationOnly() &&
4160+
"Validation-only edges are not expected here.");
41214161
auto &Edges = getEdges(SU);
41224162
if (Edge.getSrc() == SU)
41234163
Edges.Succs.push_back(Edge);
@@ -4127,25 +4167,43 @@ void SwingSchedulerDDG::addEdge(const SUnit *SU,
41274167

41284168
void SwingSchedulerDDG::initEdges(SUnit *SU) {
41294169
for (const auto &PI : SU->Preds) {
4130-
SwingSchedulerDDGEdge Edge(SU, PI, false);
4170+
SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4171+
/*IsValidationOnly=*/false);
41314172
addEdge(SU, Edge);
41324173
}
41334174

41344175
for (const auto &SI : SU->Succs) {
4135-
SwingSchedulerDDGEdge Edge(SU, SI, true);
4176+
SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4177+
/*IsValidationOnly=*/false);
41364178
addEdge(SU, Edge);
41374179
}
41384180
}
41394181

41404182
SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4141-
SUnit *ExitSU)
4183+
SUnit *ExitSU, const LoopCarriedEdges &LCE)
41424184
: EntrySU(EntrySU), ExitSU(ExitSU) {
41434185
EdgesVec.resize(SUnits.size());
41444186

4187+
// Add non-loop-carried edges based on the DAG.
41454188
initEdges(EntrySU);
41464189
initEdges(ExitSU);
41474190
for (auto &SU : SUnits)
41484191
initEdges(&SU);
4192+
4193+
// Add loop-carried edges, which are not represented in the DAG.
4194+
for (SUnit &SU : SUnits) {
4195+
SUnit *Src = &SU;
4196+
if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4197+
SDep Base(Src, SDep::Barrier);
4198+
Base.setLatency(1);
4199+
for (SUnit *Dst : *OD) {
4200+
SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4201+
/*IsValidationOnly=*/true);
4202+
Edge.setDistance(1);
4203+
ValidationOnlyEdges.push_back(Edge);
4204+
}
4205+
}
4206+
}
41494207
}
41504208

41514209
const SwingSchedulerDDG::EdgesType &
@@ -4158,17 +4216,73 @@ SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
41584216
return getEdges(SU).Succs;
41594217
}
41604218

4161-
void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
4162-
// Currently this function simply adds all dependencies represented by this
4163-
// object. After we properly handle missed dependencies, the logic here will
4164-
// be more complex, as currently missed edges should not be added to the DAG.
4219+
/// Check if \p Schedule doesn't violate the validation-only dependencies.
4220+
bool SwingSchedulerDDG::isValidSchedule(const SMSchedule &Schedule) const {
4221+
unsigned II = Schedule.getInitiationInterval();
4222+
4223+
auto ExpandCycle = [&](SUnit *SU) {
4224+
int Stage = Schedule.stageScheduled(SU);
4225+
int Cycle = Schedule.cycleScheduled(SU);
4226+
return Cycle + (Stage * II);
4227+
};
4228+
4229+
for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4230+
SUnit *Src = Edge.getSrc();
4231+
SUnit *Dst = Edge.getDst();
4232+
if (!Src->isInstr() || !Dst->isInstr())
4233+
continue;
4234+
int CycleSrc = ExpandCycle(Src);
4235+
int CycleDst = ExpandCycle(Dst);
4236+
int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4237+
if (CycleSrc > MaxLateStart) {
4238+
LLVM_DEBUG({
4239+
dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4240+
<< Dst->NodeNum << "\n";
4241+
});
4242+
return false;
4243+
}
4244+
}
4245+
return true;
4246+
}
4247+
4248+
void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4249+
const TargetInstrInfo *TII) {
41654250
for (SUnit &SU : SUnits) {
41664251
SUnit *Src = &SU;
41674252
if (auto *OrderDep = getOrderDepOrNull(Src)) {
41684253
SDep Dep(Src, SDep::Barrier);
41694254
Dep.setLatency(1);
4170-
for (SUnit *Dst : *OrderDep)
4171-
Dst->addPred(Dep);
4255+
for (SUnit *Dst : *OrderDep) {
4256+
SUnit *From = Src;
4257+
SUnit *To = Dst;
4258+
if (From->NodeNum > To->NodeNum)
4259+
std::swap(From, To);
4260+
4261+
// Add a forward edge if the following conditions are met:
4262+
//
4263+
// - The instruction of the source node (FromMI) may read memory.
4264+
// - The instruction of the target node (ToMI) may modify memory, but
4265+
// does not read it.
4266+
// - Neither instruction is a global barrier.
4267+
// - The load appears before the store in the original basic block.
4268+
// - There are no barrier or store instructions between the two nodes.
4269+
// - The target node is unreachable from the source node in the current
4270+
// DAG.
4271+
//
4272+
// TODO: These conditions are inherited from a previous implementation,
4273+
// and some may no longer be necessary. For now, we conservatively
4274+
// retain all of them to avoid regressions, but the logic could
4275+
// potentially be simplified
4276+
MachineInstr *FromMI = From->getInstr();
4277+
MachineInstr *ToMI = To->getInstr();
4278+
if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4279+
!TII->isGlobalMemoryObject(FromMI) &&
4280+
!TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4281+
SDep Pred = Dep;
4282+
Pred.setSUnit(Src);
4283+
Dst->addPred(Pred);
4284+
}
4285+
}
41724286
}
41734287
}
41744288
}

0 commit comments

Comments
 (0)