@@ -338,6 +338,17 @@ class LoopCarriedOrderDepsTracker {
338
338
void addLoopCarriedDepenenciesForChunks (const LoadStoreChunk &From,
339
339
const LoadStoreChunk &To);
340
340
341
+ /// Add a loop-carried order dependency between \p Src and \p Dst if we
342
+ /// cannot prove they are independent. When \p PerformCheapCheck is true, a
343
+ /// lightweight dependency test (referred to as "cheap check" below) is
344
+ /// performed first. Note that the cheap check is retained only to preserve
345
+ /// the existing behavior and is not expected to be used anymore.
346
+ ///
347
+ /// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
348
+ void addDependenciesBetweenSUs (const SUnitWithMemInfo &Src,
349
+ const SUnitWithMemInfo &Dst,
350
+ bool PerformCheapCheck = false );
351
+
341
352
void computeDependenciesAux ();
342
353
};
343
354
@@ -673,7 +684,7 @@ void SwingSchedulerDAG::schedule() {
673
684
Topo.InitDAGTopologicalSorting ();
674
685
changeDependences ();
675
686
postProcessDAG ();
676
- DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
687
+ DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE );
677
688
LLVM_DEBUG ({
678
689
dump ();
679
690
dbgs () << " ===== Loop Carried Edges Begin =====\n " ;
@@ -958,36 +969,44 @@ bool SUnitWithMemInfo::getUnderlyingObjects() {
958
969
959
970
// / Returns true if there is a loop-carried order dependency from \p Src to \p
960
971
// / Dst.
961
- static bool hasLoopCarriedMemDep ( const SUnitWithMemInfo &Src,
962
- const SUnitWithMemInfo &Dst,
963
- BatchAAResults &BAA,
964
- const TargetInstrInfo *TII ,
965
- const TargetRegisterInfo *TRI ) {
972
+ static bool
973
+ hasLoopCarriedMemDep ( const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
974
+ BatchAAResults &BAA, const TargetInstrInfo *TII ,
975
+ const TargetRegisterInfo *TRI ,
976
+ const SwingSchedulerDAG *SSD, bool PerformCheapCheck ) {
966
977
if (Src.isTriviallyDisjoint (Dst))
967
978
return false ;
968
979
if (isSuccOrder (Src.SU , Dst.SU ))
969
980
return false ;
970
981
971
982
MachineInstr &SrcMI = *Src.SU ->getInstr ();
972
983
MachineInstr &DstMI = *Dst.SU ->getInstr ();
973
- // First, perform the cheaper check that compares the base register.
974
- // If they are the same and the load offset is less than the store
975
- // offset, then mark the dependence as loop carried potentially.
976
- const MachineOperand *BaseOp1, *BaseOp2;
977
- int64_t Offset1, Offset2;
978
- bool Offset1IsScalable, Offset2IsScalable;
979
- if (TII->getMemOperandWithOffset (SrcMI, BaseOp1, Offset1, Offset1IsScalable,
980
- TRI) &&
981
- TII->getMemOperandWithOffset (DstMI, BaseOp2, Offset2, Offset2IsScalable,
982
- TRI)) {
983
- if (BaseOp1->isIdenticalTo (*BaseOp2) &&
984
- Offset1IsScalable == Offset2IsScalable && (int )Offset1 < (int )Offset2) {
985
- assert (TII->areMemAccessesTriviallyDisjoint (SrcMI, DstMI) &&
986
- " What happened to the chain edge?" );
987
- return true ;
984
+ if (PerformCheapCheck) {
985
+ // First, perform the cheaper check that compares the base register.
986
+ // If they are the same and the load offset is less than the store
987
+ // offset, then mark the dependence as loop carried potentially.
988
+ //
989
+ // TODO: This check will be removed.
990
+ const MachineOperand *BaseOp1, *BaseOp2;
991
+ int64_t Offset1, Offset2;
992
+ bool Offset1IsScalable, Offset2IsScalable;
993
+ if (TII->getMemOperandWithOffset (SrcMI, BaseOp1, Offset1, Offset1IsScalable,
994
+ TRI) &&
995
+ TII->getMemOperandWithOffset (DstMI, BaseOp2, Offset2, Offset2IsScalable,
996
+ TRI)) {
997
+ if (BaseOp1->isIdenticalTo (*BaseOp2) &&
998
+ Offset1IsScalable == Offset2IsScalable &&
999
+ (int )Offset1 < (int )Offset2) {
1000
+ assert (TII->areMemAccessesTriviallyDisjoint (SrcMI, DstMI) &&
1001
+ " What happened to the chain edge?" );
1002
+ return true ;
1003
+ }
988
1004
}
989
1005
}
990
1006
1007
+ if (!SSD->mayOverlapInLaterIter (&SrcMI, &DstMI))
1008
+ return false ;
1009
+
991
1010
// Second, the more expensive check that uses alias analysis on the
992
1011
// base registers. If they alias, and the load offset is less than
993
1012
// the store offset, then mark the dependence as loop carried.
@@ -1056,20 +1075,34 @@ LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
1056
1075
return std::nullopt;
1057
1076
}
1058
1077
1078
+ void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs (
1079
+ const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
1080
+ bool PerformCheapCheck) {
1081
+ // Avoid self-dependencies.
1082
+ if (Src.SU == Dst.SU )
1083
+ return ;
1084
+
1085
+ if (hasLoopCarriedMemDep (Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck))
1086
+ LoopCarried[Src.SU ->NodeNum ].set (Dst.SU ->NodeNum );
1087
+ }
1088
+
1059
1089
void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks (
1060
1090
const LoadStoreChunk &From, const LoadStoreChunk &To) {
1061
- // Add dependencies for load-to-store (WAR) from top to bottom .
1091
+ // Add load-to-store dependencies (WAR).
1062
1092
for (const SUnitWithMemInfo &Src : From.Loads )
1063
1093
for (const SUnitWithMemInfo &Dst : To.Stores )
1064
- if (Src.SU ->NodeNum < Dst.SU ->NodeNum &&
1065
- hasLoopCarriedMemDep (Src, Dst, *BAA, TII, TRI))
1066
- LoopCarried[Src.SU ->NodeNum ].set (Dst.SU ->NodeNum );
1094
+ // Perform a cheap check first if this is a forward dependency.
1095
+ addDependenciesBetweenSUs (Src, Dst, Src.SU ->NodeNum < Dst.SU ->NodeNum );
1067
1096
1068
- // TODO: The following dependencies are missed.
1069
- //
1070
- // - Dependencies for load-to-store from bottom to top.
1071
- // - Dependencies for store-to-load (RAW).
1072
- // - Dependencies for store-to-store (WAW).
1097
+ // Add store-to-load dependencies (RAW).
1098
+ for (const SUnitWithMemInfo &Src : From.Stores )
1099
+ for (const SUnitWithMemInfo &Dst : To.Loads )
1100
+ addDependenciesBetweenSUs (Src, Dst);
1101
+
1102
+ // Add store-to-store dependencies (WAW).
1103
+ for (const SUnitWithMemInfo &Src : From.Stores )
1104
+ for (const SUnitWithMemInfo &Dst : To.Stores )
1105
+ addDependenciesBetweenSUs (Src, Dst);
1073
1106
}
1074
1107
1075
1108
void LoopCarriedOrderDepsTracker::computeDependenciesAux () {
@@ -1116,7 +1149,7 @@ LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1116
1149
for (const int Succ : LCODTracker.getLoopCarried (I).set_bits ())
1117
1150
LCE.OrderDeps [&SUnits[I]].insert (&SUnits[Succ]);
1118
1151
1119
- LCE.modifySUnits (SUnits);
1152
+ LCE.modifySUnits (SUnits, TII );
1120
1153
return LCE;
1121
1154
}
1122
1155
@@ -2676,6 +2709,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2676
2709
});
2677
2710
} while (++NI != NE && scheduleFound);
2678
2711
2712
+ // If a schedule is found, validate it against the validation-only
2713
+ // dependencies.
2714
+ if (scheduleFound)
2715
+ scheduleFound = DDG->isValidSchedule (Schedule);
2716
+
2679
2717
// If a schedule is found, ensure non-pipelined instructions are in stage 0
2680
2718
if (scheduleFound)
2681
2719
scheduleFound =
@@ -4118,6 +4156,8 @@ SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4118
4156
4119
4157
void SwingSchedulerDDG::addEdge (const SUnit *SU,
4120
4158
const SwingSchedulerDDGEdge &Edge) {
4159
+ assert (!Edge.isValidationOnly () &&
4160
+ " Validation-only edges are not expected here." );
4121
4161
auto &Edges = getEdges (SU);
4122
4162
if (Edge.getSrc () == SU)
4123
4163
Edges.Succs .push_back (Edge);
@@ -4127,25 +4167,43 @@ void SwingSchedulerDDG::addEdge(const SUnit *SU,
4127
4167
4128
4168
/// Register every predecessor and successor dependency of \p SU as a regular
/// (non-validation-only) edge via addEdge().
void SwingSchedulerDDG::initEdges(SUnit *SU) {
  // Incoming edges first, in the order they appear in SU->Preds.
  for (const auto &PredDep : SU->Preds)
    addEdge(SU, SwingSchedulerDDGEdge(SU, PredDep, /*IsSucc=*/false,
                                      /*IsValidationOnly=*/false));

  // Then outgoing edges, in the order they appear in SU->Succs.
  for (const auto &SuccDep : SU->Succs)
    addEdge(SU, SwingSchedulerDDGEdge(SU, SuccDep, /*IsSucc=*/true,
                                      /*IsValidationOnly=*/false));
}
4139
4181
4140
4182
/// Build the dependence graph for \p SUnits plus the \p EntrySU / \p ExitSU
/// boundary nodes. Edges already present in the DAG become regular edges; the
/// order dependencies stored in \p LCE are kept separately as validation-only
/// edges with distance 1 and latency 1.
SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
                                     SUnit *ExitSU, const LoopCarriedEdges &LCE)
    : EntrySU(EntrySU), ExitSU(ExitSU) {
  EdgesVec.resize(SUnits.size());

  // Regular edges: mirror what the DAG already contains.
  initEdges(EntrySU);
  initEdges(ExitSU);
  for (auto &Node : SUnits)
    initEdges(&Node);

  // Loop-carried edges are not represented in the DAG; collect them into
  // ValidationOnlyEdges instead of the per-node edge lists.
  for (SUnit &Node : SUnits) {
    SUnit *Src = &Node;
    const LoopCarriedEdges::OrderDep *Targets = LCE.getOrderDepOrNull(Src);
    if (!Targets)
      continue;

    SDep Base(Src, SDep::Barrier);
    Base.setLatency(1);
    for (SUnit *Dst : *Targets) {
      SwingSchedulerDDGEdge LoopEdge(Dst, Base, /*IsSucc=*/false,
                                     /*IsValidationOnly=*/true);
      LoopEdge.setDistance(1);
      ValidationOnlyEdges.push_back(LoopEdge);
    }
  }
}
4150
4208
4151
4209
const SwingSchedulerDDG::EdgesType &
@@ -4158,17 +4216,73 @@ SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
4158
4216
return getEdges (SU).Succs ;
4159
4217
}
4160
4218
4161
- void LoopCarriedEdges::modifySUnits (std::vector<SUnit> &SUnits) {
4162
- // Currently this function simply adds all dependencies represented by this
4163
- // object. After we properly handle missed dependencies, the logic here will
4164
- // be more complex, as currently missed edges should not be added to the DAG.
4219
+ // / Check if \p Schedule doesn't violate the validation-only dependencies.
4220
+ bool SwingSchedulerDDG::isValidSchedule (const SMSchedule &Schedule) const {
4221
+ unsigned II = Schedule.getInitiationInterval ();
4222
+
4223
+ auto ExpandCycle = [&](SUnit *SU) {
4224
+ int Stage = Schedule.stageScheduled (SU);
4225
+ int Cycle = Schedule.cycleScheduled (SU);
4226
+ return Cycle + (Stage * II);
4227
+ };
4228
+
4229
+ for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4230
+ SUnit *Src = Edge.getSrc ();
4231
+ SUnit *Dst = Edge.getDst ();
4232
+ if (!Src->isInstr () || !Dst->isInstr ())
4233
+ continue ;
4234
+ int CycleSrc = ExpandCycle (Src);
4235
+ int CycleDst = ExpandCycle (Dst);
4236
+ int MaxLateStart = CycleDst + Edge.getDistance () * II - Edge.getLatency ();
4237
+ if (CycleSrc > MaxLateStart) {
4238
+ LLVM_DEBUG ({
4239
+ dbgs () << " Validation failed for edge from " << Src->NodeNum << " to "
4240
+ << Dst->NodeNum << " \n " ;
4241
+ });
4242
+ return false ;
4243
+ }
4244
+ }
4245
+ return true ;
4246
+ }
4247
+
4248
+ void LoopCarriedEdges::modifySUnits (std::vector<SUnit> &SUnits,
4249
+ const TargetInstrInfo *TII) {
4165
4250
for (SUnit &SU : SUnits) {
4166
4251
SUnit *Src = &SU;
4167
4252
if (auto *OrderDep = getOrderDepOrNull (Src)) {
4168
4253
SDep Dep (Src, SDep::Barrier);
4169
4254
Dep.setLatency (1 );
4170
- for (SUnit *Dst : *OrderDep)
4171
- Dst->addPred (Dep);
4255
+ for (SUnit *Dst : *OrderDep) {
4256
+ SUnit *From = Src;
4257
+ SUnit *To = Dst;
4258
+ if (From->NodeNum > To->NodeNum )
4259
+ std::swap (From, To);
4260
+
4261
+ // Add a forward edge if the following conditions are met:
4262
+ //
4263
+ // - The instruction of the source node (FromMI) may read memory.
4264
+ // - The instruction of the target node (ToMI) may modify memory, but
4265
+ // does not read it.
4266
+ // - Neither instruction is a global barrier.
4267
+ // - The load appears before the store in the original basic block.
4268
+ // - There are no barrier or store instructions between the two nodes.
4269
+ // - The target node is unreachable from the source node in the current
4270
+ // DAG.
4271
+ //
4272
+ // TODO: These conditions are inherited from a previous implementation,
4273
+ // and some may no longer be necessary. For now, we conservatively
4274
+ // retain all of them to avoid regressions, but the logic could
4275
+ // potentially be simplified
4276
+ MachineInstr *FromMI = From->getInstr ();
4277
+ MachineInstr *ToMI = To->getInstr ();
4278
+ if (FromMI->mayLoad () && !ToMI->mayLoad () && ToMI->mayStore () &&
4279
+ !TII->isGlobalMemoryObject (FromMI) &&
4280
+ !TII->isGlobalMemoryObject (ToMI) && !isSuccOrder (From, To)) {
4281
+ SDep Pred = Dep;
4282
+ Pred.setSUnit (Src);
4283
+ Dst->addPred (Pred);
4284
+ }
4285
+ }
4172
4286
}
4173
4287
}
4174
4288
}
0 commit comments