diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c93af749507f8..6a1ef71000b25 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -206,6 +206,11 @@ static cl::opt VectorizeNonPowerOf2( "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements.")); +static cl::opt VectorizeCopyableElements( + "slp-copyable-elements", cl::init(true), cl::Hidden, + cl::desc("Try to replace values with the idempotent instructions for " + "better vectorization.")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -519,17 +524,17 @@ static bool isSplat(ArrayRef VL) { /// instructions, we need to use the converted opcode along with the original /// uses. /// \param I The instruction to check for commutativity -/// \param InstWithUses The instruction whose uses are analyzed for special +/// \param ValWithUses The value whose uses are analyzed for special /// patterns -static bool isCommutative(Instruction *I, Instruction *InstWithUses) { +static bool isCommutative(Instruction *I, Value *ValWithUses) { if (auto *Cmp = dyn_cast(I)) return Cmp->isCommutative(); if (auto *BO = dyn_cast(I)) return BO->isCommutative() || (BO->getOpcode() == Instruction::Sub && - !InstWithUses->hasNUsesOrMore(UsesLimit) && + !ValWithUses->hasNUsesOrMore(UsesLimit) && all_of( - InstWithUses->uses(), + ValWithUses->uses(), [](const Use &U) { // Commutative, if icmp eq/ne sub, 0 CmpPredicate Pred; @@ -546,8 +551,8 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) { Flag->isOne()); })) || (BO->getOpcode() == Instruction::FSub && - !InstWithUses->hasNUsesOrMore(UsesLimit) && - all_of(InstWithUses->uses(), [](const Use &U) { + !ValWithUses->hasNUsesOrMore(UsesLimit) && + all_of(ValWithUses->uses(), [](const Use &U) { return match(U.getUser(), m_Intrinsic(m_Specific(U.get()))); })); @@ -564,6 +569,19 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) { /// \returns true if the instruction is commutative, false otherwise static bool isCommutative(Instruction *I) { return isCommutative(I, I); } +/// \returns number of operands of \p I, considering commutativity. Returns 2 +/// for commutative intrinsics. +/// \param I The instruction to check for commutativity +static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) { + if (isa(I) && isCommutative(I)) { + // IntrinsicInst::isCommutative returns true if swapping the first "two" + // arguments to the intrinsic produces the same result. + constexpr unsigned IntrinsicNumOperands = 2; + return IntrinsicNumOperands; + } + return I->getNumOperands(); +} + template static std::optional getInsertExtractIndex(const Value *Inst, unsigned Offset) { @@ -855,6 +873,23 @@ static std::optional getExtractIndex(const Instruction *E) { return *EI->idx_begin(); } +namespace llvm { +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all operands are either not instructions +/// or phi nodes or instructions from different blocks. +static bool areAllOperandsNonInsts(Value *V); +/// Checks if the provided value does not require scheduling.
It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all users are phi nodes or instructions +/// from different blocks. +static bool isUsedOutsideBlock(Value *V); +/// Checks if the specified value does not require scheduling. It does not +/// require scheduling if all operands and all users do not need to be scheduled +/// in the current basic block. +static bool doesNotNeedToBeScheduled(Value *V); +} // namespace llvm + namespace { /// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. @@ -957,6 +992,31 @@ class BinOpSameOpcodeHelper { return Instruction::Xor; llvm_unreachable("Cannot find interchangeable instruction."); } + /// Return true if the \p Opcode is a candidate for interchange. + bool hasCandidateOpcode(unsigned Opcode) const { + MaskType Candidate = Mask & SeenBefore; + switch (Opcode) { + case Instruction::Shl: + return Candidate & ShlBIT; + case Instruction::AShr: + return Candidate & AShrBIT; + case Instruction::Mul: + return Candidate & MulBIT; + case Instruction::Add: + return Candidate & AddBIT; + case Instruction::Sub: + return Candidate & SubBIT; + case Instruction::And: + return Candidate & AndBIT; + case Instruction::Or: + return Candidate & OrBIT; + case Instruction::Xor: + return Candidate & XorBIT; + default: + break; + } + llvm_unreachable("Cannot find interchangeable instruction."); + } SmallVector getOperand(const Instruction *To) const { unsigned ToOpcode = To->getOpcode(); unsigned FromOpcode = I->getOpcode(); @@ -1117,6 +1177,10 @@ class BinOpSameOpcodeHelper { AltOp.trySet(OpcodeInMaskForm, InterchangeableMask)); } unsigned getMainOpcode() const { return MainOp.getOpcode(); } + /// Return true if the \p Opcode is a candidate for interchange. + bool hasCandidateOpcode(unsigned Opcode) const { + return MainOp.hasCandidateOpcode(Opcode); + } bool hasAltOp() const { return AltOp.I; } unsigned getAltOpcode() const { return hasAltOp() ? AltOp.getOpcode() : getMainOpcode(); @@ -1152,6 +1216,8 @@ class InstructionsState { /// GetVectorCost. Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; + /// Whether the instruction state represents copyable instructions. + bool HasCopyables = false; public: Instruction *getMainOp() const { @@ -1190,9 +1256,11 @@ class InstructionsState { if (!I->isBinaryOp()) return nullptr; BinOpSameOpcodeHelper Converter(MainOp); - if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp()) - return MainOp; - return AltOp; + if (!Converter.add(I) || !Converter.add(MainOp)) + return nullptr; + if (Converter.hasAltOp() && !isAltShuffle()) + return nullptr; + return Converter.hasAltOp() ? AltOp : MainOp; } /// Checks if main/alt instructions are shift operations. @@ -1237,9 +1305,67 @@ class InstructionsState { explicit operator bool() const { return valid(); } InstructionsState() = delete; - InstructionsState(Instruction *MainOp, Instruction *AltOp) - : MainOp(MainOp), AltOp(AltOp) {} + InstructionsState(Instruction *MainOp, Instruction *AltOp, + bool HasCopyables = false) + : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {} static InstructionsState invalid() { return {nullptr, nullptr}; } + + /// Checks if the value is a copyable element.
+ bool isCopyableElement(Value *V) const { + assert(valid() && "InstructionsState is invalid."); + if (!HasCopyables) + return false; + if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr) + return false; + auto *I = dyn_cast(V); + if (!I) + return !isa(V); + if (I->getParent() != MainOp->getParent() && + (!isVectorLikeInstWithConstOps(I) || + !isVectorLikeInstWithConstOps(MainOp))) + return true; + if (I->getOpcode() == MainOp->getOpcode()) + return false; + if (!I->isBinaryOp()) + return true; + BinOpSameOpcodeHelper Converter(MainOp); + return !Converter.add(I) || !Converter.add(MainOp) || + Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode()); + } + + /// Checks if the value is non-schedulable. + bool isNonSchedulable(Value *V) const { + assert(valid() && "InstructionsState is invalid."); + auto *I = dyn_cast(V); + if (!HasCopyables) + return !I || isa(I) || isVectorLikeInstWithConstOps(I) || + doesNotNeedToBeScheduled(V); + // MainOp for copyables is always schedulable, to correctly identify + // non-schedulable copyables. + if (getMainOp() == V) + return false; + if (isCopyableElement(V)) { + auto IsNonSchedulableCopyableElement = [this](Value *V) { + auto *I = dyn_cast(V); + return !I || isa(I) || I->getParent() != MainOp->getParent() || + (doesNotNeedToBeScheduled(I) && + // If the copyable instruction comes after MainOp + // (non-schedulable, but used in the block) - cannot vectorize + // it, as it may generate a use before def. + (isVectorLikeInstWithConstOps(I) || !MainOp->comesBefore(I))); + }; + + return IsNonSchedulableCopyableElement(V); + } + return !I || isa(I) || isVectorLikeInstWithConstOps(I) || + doesNotNeedToBeScheduled(V); + } + + /// Checks if the state represents copyable instructions. + bool areInstructionsWithCopyableElements() const { + assert(valid() && "InstructionsState is invalid."); + return HasCopyables; + } }; std::pair> @@ -1767,6 +1893,7 @@ class BoUpSLP { class TreeEntry; class ScheduleEntity; class ScheduleData; + class ScheduleCopyableData; class ScheduleBundle; class ShuffleCostEstimator; class ShuffleInstructionBuilder; @@ -2126,6 +2253,7 @@ class BoUpSLP { operator bool() const { return UserTE != nullptr; } }; + friend struct DenseMapInfo; /// A helper class used for scoring candidates for two consecutive lanes. class LookAheadHeuristics { @@ -2890,18 +3018,14 @@ class BoUpSLP { assert(S.valid() && "InstructionsState is invalid."); // IntrinsicInst::isCommutative returns true if swapping the first "two" // arguments to the intrinsic produces the same result. - constexpr unsigned IntrinsicNumOperands = 2; Instruction *MainOp = S.getMainOp(); unsigned NumOperands = MainOp->getNumOperands(); - ArgSize = isa(MainOp) ? IntrinsicNumOperands : NumOperands; + ArgSize = ::getNumberOfPotentiallyCommutativeOps(MainOp); OpsVec.resize(ArgSize); unsigned NumLanes = VL.size(); for (OperandDataVec &Ops : OpsVec) Ops.resize(NumLanes); for (unsigned Lane : seq(NumLanes)) { - Value *V = VL[Lane]; - assert((isa(V) || isa(V)) && - "Expected instruction or poison value"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the // opcode of V and whether the operand at OpIdx is the LHS or RHS @@ -2912,17 +3036,24 @@ class BoUpSLP { // Since operand reordering is performed on groups of commutative // operations or alternating sequences (e.g., +, -), we can safely tell // the inverse operations by checking commutativity.
- if (isa(V)) { + auto *I = dyn_cast(VL[Lane]); + if (!I && isa(VL[Lane])) { for (unsigned OpIdx : seq(NumOperands)) OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false}; continue; } - auto [SelectedOp, Ops] = convertTo(cast(V), S); - // We cannot check commutativity by the converted instruction - // (SelectedOp) because isCommutative also examines def-use - // relationships. - bool IsInverseOperation = - !isCommutative(SelectedOp, cast(V)); + bool IsInverseOperation = false; + if (S.isCopyableElement(VL[Lane])) { + // The value is a copyable element. + IsInverseOperation = !isCommutative(MainOp, VL[Lane]); + } else { + assert(I && "Expected instruction"); + auto [SelectedOp, Ops] = convertTo(I, S); + // We cannot check commutativity by the converted instruction + // (SelectedOp) because isCommutative also examines def-use + // relationships. + IsInverseOperation = !isCommutative(SelectedOp, I); + } for (unsigned OpIdx : seq(ArgSize)) { bool APO = (OpIdx == 0) ? false : IsInverseOperation; OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false}; @@ -3792,6 +3923,9 @@ class BoUpSLP { /// reordering of operands during buildTreeRec() and vectorizeTree(). SmallVector Operands; + /// Copyable elements of the entry node. + SmallPtrSet CopyableElements; + /// MainOp and AltOp are recorded inside. S should be obtained from /// newTreeEntry. InstructionsState S = InstructionsState::invalid(); @@ -3820,11 +3954,7 @@ class BoUpSLP { void setInterleave(unsigned Factor) { InterleaveFactor = Factor; } /// Marks the node as one that does not require scheduling. - void setDoesNotNeedToSchedule() { - assert(::doesNotNeedToSchedule(Scalars) && - "Expected to not need scheduling"); - DoesNotNeedToSchedule = true; - } + void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; } /// Returns true if the node is marked as one that does not require /// scheduling. bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; } @@ -3896,6 +4026,20 @@ class BoUpSLP { bool hasState() const { return S.valid(); } + /// Add \p V to the list of copyable elements. + void addCopyableElement(Value *V) { + assert(S.isCopyableElement(V) && "Not a copyable element."); + CopyableElements.insert(V); + } + + /// Returns true if \p V is a copyable element. + bool isCopyableElement(Value *V) const { + return CopyableElements.contains(V); + } + + /// Returns true if any scalar in the list is a copyable element. + bool hasCopyableElements() const { return !CopyableElements.empty(); } + /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 
int findLaneForValue(Value *V) const { @@ -3968,6 +4112,8 @@ class BoUpSLP { for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; dbgs() << "State: "; + if (S && hasCopyableElements()) + dbgs() << "[[Copyable]] "; switch (State) { case Vectorize: if (InterleaveFactor > 0) { @@ -4145,12 +4291,20 @@ class BoUpSLP { } } } else if (!Last->isGather()) { - if (doesNotNeedToSchedule(VL)) + if (isa(S.getMainOp()) || + isVectorLikeInstWithConstOps(S.getMainOp()) || + (!S.areInstructionsWithCopyableElements() && + doesNotNeedToSchedule(VL)) || + all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) Last->setDoesNotNeedToSchedule(); SmallPtrSet Processed; for (Value *V : VL) { if (isa(V)) continue; + if (S.isCopyableElement(V)) { + Last->addCopyableElement(V); + continue; + } auto It = ScalarToTreeEntries.find(V); if (It == ScalarToTreeEntries.end()) { ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last); @@ -4162,16 +4316,14 @@ class BoUpSLP { } } // Update the scheduler bundle to point to this TreeEntry. - assert((!Bundle.getBundle().empty() || isa(S.getMainOp()) || - isVectorLikeInstWithConstOps(S.getMainOp()) || - Last->doesNotNeedToSchedule()) && + assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) && "Bundle and VL out of sync"); if (!Bundle.getBundle().empty()) { #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) auto *BundleMember = Bundle.getBundle().begin(); SmallPtrSet Processed; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second) + if (S.isNonSchedulable(V) || !Processed.insert(V).second) continue; ++BundleMember; } @@ -4280,7 +4432,8 @@ class BoUpSLP { /// in general. ScalarsVectorizationLegality getScalarsVectorizationLegality(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) const; + const EdgeInfo &UserTreeIdx, + bool TryCopyableElementsVectorization) const; /// Checks if the specified list of the instructions/values can be vectorized /// and fills required data before actual scheduling of the instructions. @@ -4433,16 +4586,18 @@ class BoUpSLP { /// List of hashes of vector of loads, which are known to be non vectorizable. DenseSet ListOfKnonwnNonVectorizableLoads; - /// Represents a scheduling entity, either ScheduleData or ScheduleBundle. - /// ScheduleData used to gather dependecies for a single instructions, while - /// ScheduleBundle represents a batch of instructions, going to be groupped - /// together. + /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData + /// or ScheduleBundle. ScheduleData is used to gather dependencies for a single + /// instruction, while ScheduleBundle represents a batch of instructions + /// going to be grouped together. ScheduleCopyableData models an extra user for + /// "copyable" instructions.
class ScheduleEntity { friend class ScheduleBundle; friend class ScheduleData; + friend class ScheduleCopyableData; protected: - enum class Kind { ScheduleData, ScheduleBundle }; + enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData }; Kind getKind() const { return K; } ScheduleEntity(Kind K) : K(K) {} @@ -4461,17 +4616,79 @@ class BoUpSLP { void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; } int getSchedulingPriority() const { return SchedulingPriority; } bool isReady() const { - if (auto *SD = dyn_cast(this)) + if (const auto *SD = dyn_cast(this)) return SD->isReady(); + if (const auto *CD = dyn_cast(this)) + return CD->isReady(); return cast(this)->isReady(); } + /// Returns true if the dependency information has been calculated. + /// Note that dependency validity can vary between instructions within + /// a single bundle. + bool hasValidDependencies() const { + if (const auto *SD = dyn_cast(this)) + return SD->hasValidDependencies(); + if (const auto *CD = dyn_cast(this)) + return CD->hasValidDependencies(); + return cast(this)->hasValidDependencies(); + } + /// Gets the number of unscheduled dependencies. + int getUnscheduledDeps() const { + if (const auto *SD = dyn_cast(this)) + return SD->getUnscheduledDeps(); + if (const auto *CD = dyn_cast(this)) + return CD->getUnscheduledDeps(); + return cast(this)->unscheduledDepsInBundle(); + } + /// Increments the number of unscheduled dependencies. + int incrementUnscheduledDeps(int Incr) { + if (auto *SD = dyn_cast(this)) + return SD->incrementUnscheduledDeps(Incr); + return cast(this)->incrementUnscheduledDeps(Incr); + } + /// Gets the number of dependencies. + int getDependencies() const { + if (const auto *SD = dyn_cast(this)) + return SD->getDependencies(); + return cast(this)->getDependencies(); + } + /// Gets the instruction. + Instruction *getInst() const { + if (const auto *SD = dyn_cast(this)) + return SD->getInst(); + return cast(this)->getInst(); + } + /// Gets/sets if the bundle is scheduled. bool isScheduled() const { return IsScheduled; } void setScheduled(bool Scheduled) { IsScheduled = Scheduled; } static bool classof(const ScheduleEntity *) { return true; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(raw_ostream &OS) const { + if (const auto *SD = dyn_cast(this)) + return SD->dump(OS); + if (const auto *CD = dyn_cast(this)) + return CD->dump(OS); + return cast(this)->dump(OS); + } + + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } +#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) }; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + friend inline raw_ostream &operator<<(raw_ostream &OS, + const BoUpSLP::ScheduleEntity &SE) { + SE.dump(OS); + return OS; + } +#endif + /// Contains all scheduling relevant data for an instruction. /// A ScheduleData either represents a single instruction or a member of an /// instruction bundle (= a group of instructions which is combined into a @@ -4534,10 +4751,18 @@ class BoUpSLP { /// Clears all dependency information. void clearDependencies() { - Dependencies = InvalidDeps; - resetUnscheduledDeps(); + clearDirectDependencies(); MemoryDependencies.clear(); ControlDependencies.clear(); + } + + /// Clears all direct dependencies only, except for control and memory + /// dependencies. + /// Required for copyable elements to correctly handle control/memory deps + /// and avoid extra recalculation of such deps.
+ void clearDirectDependencies() { + Dependencies = InvalidDeps; + resetUnscheduledDeps(); IsScheduled = false; } @@ -4627,7 +4852,7 @@ class BoUpSLP { class ScheduleBundle final : public ScheduleEntity { /// The schedule data for the instructions in the bundle. - SmallVector Bundle; + SmallVector Bundle; /// True if this bundle is valid. bool IsValid = true; /// The TreeEntry that this instruction corresponds to. @@ -4643,7 +4868,7 @@ class BoUpSLP { /// Verify basic self consistency properties void verify() const { - for (const ScheduleData *SD : Bundle) { + for (const ScheduleEntity *SD : Bundle) { if (SD->hasValidDependencies()) { assert(SD->getUnscheduledDeps() <= SD->getDependencies() && "invariant"); @@ -4663,7 +4888,7 @@ class BoUpSLP { int unscheduledDepsInBundle() const { assert(*this && "bundle must not be empty"); int Sum = 0; - for (const ScheduleData *BundleMember : Bundle) { + for (const ScheduleEntity *BundleMember : Bundle) { if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps) return ScheduleData::InvalidDeps; Sum += BundleMember->getUnscheduledDeps(); @@ -4675,7 +4900,7 @@ class BoUpSLP { /// Note that depenendency validity can vary between instructions within /// a single bundle. bool hasValidDependencies() const { - return all_of(Bundle, [](const ScheduleData *SD) { + return all_of(Bundle, [](const ScheduleEntity *SD) { return SD->hasValidDependencies(); }); } @@ -4689,10 +4914,10 @@ class BoUpSLP { /// Returns the bundle of scheduling data, associated with the current /// instruction. - ArrayRef getBundle() { return Bundle; } - ArrayRef getBundle() const { return Bundle; } + ArrayRef getBundle() { return Bundle; } + ArrayRef getBundle() const { return Bundle; } /// Adds an instruction to the bundle. - void add(ScheduleData *SD) { Bundle.push_back(SD); } + void add(ScheduleEntity *SD) { Bundle.push_back(SD); } /// Gets/sets the associated tree entry. void setTreeEntry(TreeEntry *TE) { this->TE = TE; } @@ -4709,8 +4934,11 @@ class BoUpSLP { return; } OS << '['; - interleaveComma(Bundle, OS, - [&](const ScheduleData *SD) { OS << *SD->getInst(); }); + interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) { + if (isa(SD)) + OS << ""; + OS << *SD->getInst(); + }); OS << ']'; } @@ -4729,6 +4957,129 @@ class BoUpSLP { } #endif + /// Contains all scheduling relevant data for the copyable instruction. + /// It models the virtual instructions that are supposed to replace the + /// original instructions. E.g., if instruction %0 = load is a part of the bundle [%0, + /// %1], where %1 = add, then the ScheduleCopyableData models virtual + /// instruction %virt = add %0, 0. + class ScheduleCopyableData final : public ScheduleEntity { + /// The source schedule data for the instruction. + Instruction *Inst = nullptr; + /// The edge information for the instruction. + const EdgeInfo EI; + /// This ScheduleData is in the current scheduling region if this matches + /// the current SchedulingRegionID of BlockScheduling. + int SchedulingRegionID = 0; + /// The bundle this data is part of.
+ ScheduleBundle &Bundle; + + public: + ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I, + const EdgeInfo &EI, ScheduleBundle &Bundle) + : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI), + SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {} + static bool classof(const ScheduleEntity *Entity) { + return Entity->getKind() == Kind::ScheduleCopyableData; + } + + /// Verify basic self consistency properties + void verify() { + if (hasValidDependencies()) { + assert(UnscheduledDeps <= Dependencies && "invariant"); + } else { + assert(UnscheduledDeps == Dependencies && "invariant"); + } + + if (IsScheduled) { + assert(hasValidDependencies() && UnscheduledDeps == 0 && + "unexpected scheduled state"); + } + } + + /// Returns true if the dependency information has been calculated. + /// Note that dependency validity can vary between instructions within + /// a single bundle. + bool hasValidDependencies() const { + return Dependencies != ScheduleData::InvalidDeps; + } + + /// Returns true if it is ready for scheduling, i.e. it has no more + /// unscheduled depending instructions/bundles. + bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; } + + /// Modifies the number of unscheduled dependencies for this instruction, + /// and returns the number of remaining dependencies for the containing + /// bundle. + int incrementUnscheduledDeps(int Incr) { + assert(hasValidDependencies() && + "increment of unscheduled deps would be meaningless"); + UnscheduledDeps += Incr; + assert(UnscheduledDeps >= 0 && "invariant"); + return UnscheduledDeps; + } + + /// Sets the number of unscheduled dependencies to the number of + /// dependencies. + void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; } + + /// Gets the number of unscheduled dependencies. + int getUnscheduledDeps() const { return UnscheduledDeps; } + /// Gets the number of dependencies. + int getDependencies() const { return Dependencies; } + /// Initializes the number of dependencies. + void initDependencies() { Dependencies = 0; } + /// Increments the number of dependencies. + void incDependencies() { Dependencies++; } + + /// Gets scheduling region ID. + int getSchedulingRegionID() const { return SchedulingRegionID; } + + /// Gets the instruction. + Instruction *getInst() const { return Inst; } + + /// Clears all dependency information. + void clearDependencies() { + Dependencies = ScheduleData::InvalidDeps; + UnscheduledDeps = ScheduleData::InvalidDeps; + IsScheduled = false; + } + + /// Gets the edge information. + const EdgeInfo &getEdgeInfo() const { return EI; } + + /// Gets the bundle. + ScheduleBundle &getBundle() { return Bundle; } + const ScheduleBundle &getBundle() const { return Bundle; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); } + + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } +#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + + private: + /// True, if it has valid dependency information. These nodes always have + /// only a single dependency. + int Dependencies = ScheduleData::InvalidDeps; + + /// The number of dependencies minus the number of dependencies of scheduled + /// instructions. As soon as this is zero, the instruction/bundle gets ready + /// for scheduling. + /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps = ScheduleData::InvalidDeps; + }; + +#ifndef NDEBUG + friend inline raw_ostream & + operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) { + SD.dump(OS); + return OS; + } +#endif + friend struct GraphTraits; friend struct DOTGraphTraits; @@ -4755,6 +5106,10 @@ class BoUpSLP { void clear() { ScheduledBundles.clear(); ScheduledBundlesList.clear(); + ScheduleCopyableDataMap.clear(); + ScheduleCopyableDataMapByInst.clear(); + ScheduleCopyableDataMapByInstUser.clear(); + ScheduleCopyableDataMapByUsers.clear(); ReadyInsts.clear(); ScheduleStart = nullptr; ScheduleEnd = nullptr; @@ -4781,7 +5136,7 @@ class BoUpSLP { // Avoid lookup if can't possibly be in map. return nullptr; ScheduleData *SD = ScheduleDataMap.lookup(I); - if (SD && isInSchedulingRegion(SD)) + if (SD && isInSchedulingRegion(*SD)) return SD; return nullptr; } @@ -4790,6 +5145,183 @@ class BoUpSLP { return getScheduleData(dyn_cast(V)); } + /// Returns the ScheduleCopyableData for the given edge (user tree entry and + /// operand number) and value. + ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI, + const Value *V) const { + auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V)); + if (It == ScheduleCopyableDataMap.end()) + return nullptr; + ScheduleCopyableData *SD = It->getSecond().get(); + if (!isInSchedulingRegion(*SD)) + return nullptr; + return SD; + } + + /// Returns the ScheduleCopyableData for the given user \p User, operand + /// number and operand \p V. + SmallVector + getScheduleCopyableData(const Value *User, unsigned OperandIdx, + const Value *V) { + const auto It = ScheduleCopyableDataMapByInstUser.find( + std::make_pair(std::make_pair(User, OperandIdx), V)); + if (It == ScheduleCopyableDataMapByInstUser.end()) + return {}; + SmallVector Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + /// Returns true if all operands of the given instruction \p User are + /// replaced by copyable data. + /// \param User The user instruction. + /// \param Op The operand, which might be replaced by the copyable data. + /// \param SLP The SLP tree. + /// \param NumOps The number of operands used. If the instruction uses the + /// same operand several times, check for the first use, then the second, + /// etc. + bool areAllOperandsReplacedByCopyableData(Instruction *User, + Instruction *Op, BoUpSLP &SLP, + unsigned NumOps) const { + assert(NumOps > 0 && "No operands"); + SmallDenseMap PotentiallyReorderedEntriesCount; + SmallDenseMap OrderedEntriesCount; + for (const Use &U : User->operands()) { + if (U.get() != Op) + continue; + ArrayRef Entries = SLP.getTreeEntries(User); + if (Entries.empty()) + return false; + // Check all tree entries to see if they have operands replaced by + // copyable data. + for (TreeEntry *TE : SLP.getTreeEntries(User)) { + // Check if the user is commutative. + // The commutatives are handled later, as their operands can be + // reordered. + // Same applies even for non-commutative cmps, because we can + // potentially invert their predicate and, thus, reorder the operands. + bool IsCommutativeUser = + ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User); + EdgeInfo EI(TE, U.getOperandNo()); + if (!IsCommutativeUser && !isa(User)) { + unsigned &OpCnt = + OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); + if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) + return false; + // Found copyable operand - continue.
+ ++OpCnt; + continue; + } + ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0) + .first->getSecond(); + } + } + // Check the commutative/cmp entries. + if (!PotentiallyReorderedEntriesCount.empty()) { + for (auto &P : PotentiallyReorderedEntriesCount) { + auto *It = find(P.first->Scalars, User); + assert(It != P.first->Scalars.end() && + "User is not in the tree entry"); + int Lane = std::distance(P.first->Scalars.begin(), It); + assert(Lane >= 0 && "Lane is not found"); + if (isa(User) && !P.first->ReorderIndices.empty()) + Lane = P.first->ReorderIndices[Lane]; + assert(Lane < static_cast(P.first->Scalars.size()) && + "Couldn't find extract lane"); + SmallVector OpIndices; + for (unsigned OpIdx : + seq(::getNumberOfPotentiallyCommutativeOps( + P.first->getMainOp()))) { + if (P.first->getOperand(OpIdx)[Lane] == Op && + getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op)) + --P.getSecond(); + } + } + return all_of(PotentiallyReorderedEntriesCount, + [&](const std::pair &P) { + return P.second == NumOps - 1; + }); + } + return true; + } + + SmallVector + getScheduleCopyableData(const Instruction *I) const { + const auto It = ScheduleCopyableDataMapByInst.find(I); + if (It == ScheduleCopyableDataMapByInst.end()) + return {}; + SmallVector Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + SmallVector + getScheduleCopyableDataUsers(const Instruction *User) const { + const auto It = ScheduleCopyableDataMapByUsers.find(User); + if (It == ScheduleCopyableDataMapByUsers.end()) + return {}; + SmallVector Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI, + Instruction *I, + int SchedulingRegionID, + ScheduleBundle &Bundle) { + assert(!getScheduleCopyableData(EI, I) && "already in the map"); + ScheduleCopyableData *CD = + ScheduleCopyableDataMap + .try_emplace(std::make_pair(EI, I), + std::make_unique( + SchedulingRegionID, I, EI, Bundle)) + .first->getSecond() + .get(); + ScheduleCopyableDataMapByInst[I].push_back(CD); + if (EI.UserTE) { + ArrayRef Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, I); + assert(It != Op.end() && "Lane not set"); + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast(EI.UserTE->Scalars[Lane]); + ScheduleCopyableDataMapByInstUser + .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I)) + .first->getSecond() + .push_back(CD); + ScheduleCopyableDataMapByUsers.try_emplace(I) + .first->getSecond() + .insert(CD); + // Remove extra deps for users that become non-immediate users of the + // instruction. This may happen if a chain of the same copyable + // elements appears in the tree.
+ if (In == I) { + EdgeInfo UserEI = EI.UserTE->UserTreeIndex; + if (ScheduleCopyableData *UserCD = + getScheduleCopyableData(UserEI, In)) + ScheduleCopyableDataMapByUsers[I].remove(UserCD); + } + It = find(make_range(std::next(It), Op.end()), I); + } while (It != Op.end()); + } + return *CD; + } + ArrayRef getScheduleBundles(Value *V) const { auto *I = dyn_cast(V); if (!I) @@ -4800,34 +5332,44 @@ class BoUpSLP { return It->getSecond(); } - bool isInSchedulingRegion(ScheduleData *SD) const { - return SD->getSchedulingRegionID() == SchedulingRegionID; - } - - bool isInSchedulingRegion(const ScheduleBundle &Bundle) const { - return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) { - return BundleMember->getSchedulingRegionID() == SchedulingRegionID; - }); + /// Returns true if the entity is in the scheduling region. + bool isInSchedulingRegion(const ScheduleEntity &SD) const { + if (const auto *Data = dyn_cast(&SD)) + return Data->getSchedulingRegionID() == SchedulingRegionID; + if (const auto *CD = dyn_cast(&SD)) + return CD->getSchedulingRegionID() == SchedulingRegionID; + return all_of(cast(SD).getBundle(), + [&](const ScheduleEntity *BundleMember) { + return isInSchedulingRegion(*BundleMember); + }); } /// Marks an instruction as scheduled and puts all dependent ready /// instructions into the ready-list. template - void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) { - auto ProcessBundleMember = [&](ScheduleData *BundleMember, - ScheduleBundle *Bundle) { + void schedule(const BoUpSLP &R, const InstructionsState &S, + const EdgeInfo &EI, ScheduleEntity *Data, + ReadyListType &ReadyList) { + auto ProcessBundleMember = [&](ScheduleEntity *BundleMember, + ArrayRef Bundles) { // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. - auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) { + auto DecrUnsched = [&](ScheduleEntity *Data, bool IsControl = false) { if ((IsControl || Data->hasValidDependencies()) && Data->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction // into the ready list. - if (ArrayRef Bundles = - getScheduleBundles(Data->getInst()); - !Bundles.empty()) { + SmallVector CopyableBundle; + ArrayRef Bundles; + if (auto *CD = dyn_cast(Data)) { + CopyableBundle.push_back(&CD->getBundle()); + Bundles = CopyableBundle; + } else { + Bundles = getScheduleBundles(Data->getInst()); + } + if (!Bundles.empty()) { for (ScheduleBundle *Bundle : Bundles) { if (Bundle->unscheduledDepsInBundle() == 0) { assert(!Bundle->isScheduled() && @@ -4841,12 +5383,21 @@ class BoUpSLP { } assert(!Data->isScheduled() && "already scheduled bundle gets ready"); + assert(!isa(Data) && + "Expected non-copyable data"); ReadyList.insert(Data); LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n"); } }; - auto DecrUnschedForInst = [&](Instruction *I) { + auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx, + Instruction *I) { + SmallVector CopyableData = + getScheduleCopyableData(User, OpIdx, I); + for (ScheduleCopyableData *CD : CopyableData) + DecrUnsched(CD, /*IsControl=*/false); + if (!CopyableData.empty()) + return; if (ScheduleData *OpSD = getScheduleData(I)) DecrUnsched(OpSD, /*IsControl=*/false); }; @@ -4854,45 +5405,92 @@ class BoUpSLP { // If BundleMember is a vector bundle, its operands may have been // reordered during buildTree(). 
We therefore need to get its operands // through the TreeEntry. - if (Bundle) { - // Need to search for the lane since the tree entry can be reordered. + if (!Bundles.empty()) { auto *In = BundleMember->getInst(); - int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), - find(Bundle->getTreeEntry()->Scalars, In)); - assert(Lane >= 0 && "Lane not set"); - - // Since vectorization tree is being built recursively this assertion - // ensures that the tree entry has all operands set before reaching - // this code. Couple of exceptions known at the moment are extracts - // where their second (immediate) operand is not added. Since - // immediates do not affect scheduler behavior this is considered - // okay. - assert(In && - (isa(In) || - In->getNumOperands() == - Bundle->getTreeEntry()->getNumOperands()) && - "Missed TreeEntry operands?"); - - for (unsigned OpIdx : - seq(Bundle->getTreeEntry()->getNumOperands())) - if (auto *I = dyn_cast( - Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { - LLVM_DEBUG(dbgs() - << "SLP: check for readiness (def): " << *I << "\n"); - DecrUnschedForInst(I); + // Count uses of each instruction operand. + SmallDenseMap OperandsUses; + if (isa(BundleMember)) { + // Copyable data is used only once (uses itself). + OperandsUses[In] = 1; + } else { + for (const Use &U : In->operands()) { + if (auto *I = dyn_cast(U.get())) + ++OperandsUses[I]; } + } + // Decrement the unscheduled counter and insert to ready list if + // ready. + auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE, + unsigned OpIdx) { + const EdgeInfo EI = {UserTE, OpIdx}; + if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) { + DecrUnsched(CD, /*IsControl=*/false); + return; + } + if (ScheduleData *OpSD = getScheduleData(I)) { + auto It = OperandsUses.find(I); + assert(It != OperandsUses.end() && "Operand not found"); + if (It->second > 0) { + DecrUnsched(OpSD, /*IsControl=*/false); + --It->getSecond(); + } + } + }; + + for (ScheduleBundle *Bundle : Bundles) { + // Need to search for the lane since the tree entry can be + // reordered. + int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), + find(Bundle->getTreeEntry()->Scalars, In)); + assert(Lane >= 0 && "Lane not set"); + if (isa(In) && + !Bundle->getTreeEntry()->ReorderIndices.empty()) + Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; + assert(Lane < static_cast( + Bundle->getTreeEntry()->Scalars.size()) && + "Couldn't find extract lane"); + + // Since vectorization tree is being built recursively this + // assertion ensures that the tree entry has all operands set before + // reaching this code. Couple of exceptions known at the moment are + // extracts where their second (immediate) operand is not added. + // Since immediates do not affect scheduler behavior this is + // considered okay. + assert(In && + (isa(In) || + In->getNumOperands() == + Bundle->getTreeEntry()->getNumOperands() || + Bundle->getTreeEntry()->isCopyableElement(In)) && + "Missed TreeEntry operands?"); + + for (unsigned OpIdx : + seq(Bundle->getTreeEntry()->getNumOperands())) + if (auto *I = dyn_cast( + Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { + LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I + << "\n"); + DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx); + } + } } else { // If BundleMember is a stand-alone instruction, no operand reordering // has taken place, so we directly access its operands. 
- for (Use &U : BundleMember->getInst()->operands()) + for (Use &U : BundleMember->getInst()->operands()) { if (auto *I = dyn_cast(U.get())) { LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I << "\n"); - DecrUnschedForInst(I); + DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I); } + } } // Handle the memory dependencies. - for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) { + auto *SD = dyn_cast(BundleMember); + if (!SD) + return; + SmallPtrSet VisitedMemory; + for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) { + if (!VisitedMemory.insert(MemoryDep).second) + continue; // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): " @@ -4900,7 +5498,10 @@ class BoUpSLP { DecrUnsched(MemoryDep); } // Handle the control dependencies. - for (ScheduleData *Dep : BundleMember->getControlDependencies()) { + SmallPtrSet VisitedControl; + for (ScheduleData *Dep : SD->getControlDependencies()) { + if (!VisitedControl.insert(Dep).second) + continue; // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. LLVM_DEBUG(dbgs() @@ -4911,12 +5512,14 @@ class BoUpSLP { if (auto *SD = dyn_cast(Data)) { SD->setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ProcessBundleMember(SD, nullptr); + ProcessBundleMember(SD, {}); } else { ScheduleBundle &Bundle = *cast(Data); Bundle.setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n"); - auto AreAllBundlesScheduled = [&](const ScheduleData *SD) { + auto AreAllBundlesScheduled = [&](const ScheduleEntity *SD) { + if (isa(SD)) + return true; ArrayRef SDBundles = getScheduleBundles(SD->getInst()); return !SDBundles.empty() && @@ -4924,10 +5527,12 @@ class BoUpSLP { return SDBundle->isScheduled(); }); }; - for (ScheduleData *SD : Bundle.getBundle()) { + for (ScheduleEntity *SD : Bundle.getBundle()) { if (AreAllBundlesScheduled(SD)) { SD->setScheduled(/*Scheduled=*/true); - ProcessBundleMember(SD, &Bundle); + ProcessBundleMember(SD, isa(SD) + ? &Bundle + : getScheduleBundles(SD->getInst())); } } } @@ -4955,7 +5560,7 @@ class BoUpSLP { auto *SD = getScheduleData(I); if (!SD) continue; - assert(isInSchedulingRegion(SD) && + assert(isInSchedulingRegion(*SD) && "primary schedule data not in window?"); SD->verify(); } @@ -4996,7 +5601,11 @@ class BoUpSLP { /// Build a bundle from the ScheduleData nodes corresponding to the /// scalar instruction for each lane. - ScheduleBundle &buildBundle(ArrayRef VL); + /// \param VL The list of scalar instructions. + /// \param S The state of the instructions. + /// \param EI The edge in the SLP graph or the user node/operand number. + ScheduleBundle &buildBundle(ArrayRef VL, + const InstructionsState &S, const EdgeInfo &EI); /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are @@ -5005,7 +5614,7 @@ class BoUpSLP { /// std::nullopt if \p VL is allowed to be scheduled. std::optional tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, - const InstructionsState &S); + const InstructionsState &S, const EdgeInfo &EI); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); @@ -5045,6 +5654,48 @@ class BoUpSLP { /// ScheduleData structures are recycled. 
SmallDenseMap ScheduleDataMap; + /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand + /// number) and the operand instruction, represented as copyable element. + SmallDenseMap, + std::unique_ptr> + ScheduleCopyableDataMap; + + /// Represents mapping between instruction and all related + /// ScheduleCopyableData (for all uses in the tree, represented as copyable + /// element). The SLP tree may contain several representations of the same + /// instruction. + SmallDenseMap> + ScheduleCopyableDataMapByInst; + + /// Represents mapping between user value and operand number, the operand + /// value and all related ScheduleCopyableData. The relation is 1:n, because + /// the same user may reference the same operand in different tree entries + /// and the operand may be modelled by different copyable data elements. + SmallDenseMap, const Value *>, + SmallVector> + ScheduleCopyableDataMapByInstUser; + + /// Represents mapping between instruction and all related + /// ScheduleCopyableData. It represents the mapping between the actual + /// instruction and the last copyable data element in the chain. E.g., if + /// the graph models the following instructions: + /// %0 = non-add instruction ... + /// ... + /// %4 = add %3, 1 + /// %5 = add %4, 1 + /// %6 = insertelement poison, %0, 0 + /// %7 = insertelement %6, %5, 1 + /// And the graph is modeled as: + /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ] + /// -> [1, 0] -> [%1, 0] + /// + /// this map will map %0 only to the copyable element <1>, which is the last + /// user (direct user of the actual instruction). <0> uses <1>, so <1> will + /// keep the map to <0>, not %0. + SmallDenseMap> + ScheduleCopyableDataMapByUsers; + /// Attaches ScheduleBundle to Instruction. SmallDenseMap> ScheduledBundles; @@ -5091,7 +5742,7 @@ class BoUpSLP { /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. - void scheduleBlock(BlockScheduling *BS); + void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting. const SmallDenseSet *UserIgnoreList = nullptr; @@ -5164,6 +5815,30 @@ class BoUpSLP { } // end namespace slpvectorizer +template <> struct DenseMapInfo { + using FirstInfo = DenseMapInfo; + using SecondInfo = DenseMapInfo; + static BoUpSLP::EdgeInfo getEmptyKey() { + return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(), + SecondInfo::getEmptyKey()); + } + + static BoUpSLP::EdgeInfo getTombstoneKey() { + return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(), + SecondInfo::getTombstoneKey()); + } + + static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) { + return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE), + SecondInfo::getHashValue(Val.EdgeIdx)); + } + + static bool isEqual(const BoUpSLP::EdgeInfo &LHS, + const BoUpSLP::EdgeInfo &RHS) { + return LHS == RHS; + } +}; + template <> struct GraphTraits { using TreeEntry = BoUpSLP::TreeEntry; @@ -7891,7 +8566,7 @@ void BoUpSLP::buildExternalUses( // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - if (!isa(Scalar)) + if (!isa(Scalar) || Entry->isCopyableElement(Scalar)) continue; // All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar); @@ -9599,7 +10274,8 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, }))) { if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() && - all_of(UniqueValues, IsaPred)) { + (S.areInstructionsWithCopyableElements() || + all_of(UniqueValues, IsaPred))) { // Find the number of elements, which forms full vectors. unsigned PWSz = getFullVectorNumberOfElements( TTI, UniqueValues.front()->getType(), UniqueValues.size()); @@ -9616,9 +10292,10 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, PaddedUniqueValues.append( PWSz - UniqueValues.size(), PoisonValue::get(UniqueValues.front()->getType())); - // Check that extended with poisons operations are still valid for - // vectorization (div/rem are not allowed). - if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) { + // Check that extended with poisons/copyable operations are still valid + // for vectorization (div/rem are not allowed). + if (!S.areInstructionsWithCopyableElements() && + !getSameOpcode(PaddedUniqueValues, TLI).valid()) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); ReuseShuffleIndices.clear(); return false; @@ -9767,13 +10444,98 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, } namespace { -/// Class accepts incoming list of values and generates the list of values -/// for scheduling and list of operands for the new nodes. +/// Class accepts incoming list of values, checks if it is able to model +/// "copyable" values as compatible operations, and generates the list of values +/// for scheduling and the list of operands for the new nodes. class InstructionsCompatibilityAnalysis { DominatorTree &DT; const DataLayout &DL; const TargetTransformInfo &TTI; const TargetLibraryInfo &TLI; + unsigned MainOpcode = 0; + Instruction *MainOp = nullptr; + + /// Identifies the best candidate value, which represents main opcode + /// operation. + /// Currently the best candidate is the Add instruction with the parent + /// block with the highest DFS incoming number (the block that dominates the + /// others). + void findMainInstruction(ArrayRef VL) { + BasicBlock *Parent = nullptr; + // Checks if the instruction has supported opcode. + auto IsSupportedOpcode = [](Instruction *I) { + return I && I->getOpcode() == Instruction::Add; + }; + // Exclude operand instructions immediately to improve compile time; they + // will not be schedulable anyway.
SmallDenseSet Operands; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (!I) + continue; + if (!DT.isReachableFromEntry(I->getParent())) + continue; + if (!MainOp) { + MainOp = I; + Parent = I->getParent(); + Operands.insert(I->op_begin(), I->op_end()); + continue; + } + if (Parent == I->getParent()) { + if (!IsSupportedOpcode(MainOp) && !Operands.contains(I)) + MainOp = I; + Operands.insert(I->op_begin(), I->op_end()); + continue; + } + auto *NodeA = DT.getNode(Parent); + auto *NodeB = DT.getNode(I->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == + (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) { + MainOp = I; + Parent = I->getParent(); + Operands.clear(); + Operands.insert(I->op_begin(), I->op_end()); + } + } + if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) { + MainOp = nullptr; + return; + } + MainOpcode = MainOp->getOpcode(); + } + + /// Returns the idempotent value for the \p MainOp with the detected \p + /// MainOpcode. For Add, returns 0. For Or, it should choose between false and + /// the operand itself, since V or V == V. + Value *selectBestIdempotentValue() const { + switch (MainOpcode) { + case Instruction::Add: + return ConstantInt::getNullValue(MainOp->getType()); + default: + break; + } + llvm_unreachable("Unsupported opcode"); + } + + /// Returns the value and operands for \p V, considering whether it is an + /// original instruction whose actual operands should be returned, or a + /// copyable element that should be represented as an idempotent instruction. + SmallVector getOperands(const InstructionsState &S, Value *V) const { + if (isa(V)) + return {V, V}; + if (!S.isCopyableElement(V)) + return convertTo(cast(V), S).second; + switch (MainOpcode) { + case Instruction::Add: + return {V, selectBestIdempotentValue()}; + default: + break; + } + llvm_unreachable("Unsupported opcode"); + } /// Builds operands for the original instructions. void @@ -9934,22 +10696,145 @@ class InstructionsCompatibilityAnalysis { const TargetLibraryInfo &TLI) : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {} + InstructionsState + buildInstructionsState(ArrayRef VL, const BoUpSLP &R, + bool TryCopyableElementsVectorization, + bool WithProfitabilityCheck = false) { + InstructionsState S = getSameOpcode(VL, TLI); + if (S) + return S; + if (!VectorizeCopyableElements || !TryCopyableElementsVectorization) + return S; + findMainInstruction(VL); + if (!MainOp) + return InstructionsState::invalid(); + S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true); + if (!WithProfitabilityCheck) + return S; + // Check if it is profitable to vectorize the instruction. + SmallVector Operands = buildOperands(S, VL); + if (VL.size() == 2) { + // Check if the operands allow better vectorization. + SmallVector, 4> Candidates; + Candidates.emplace_back(Operands[0][0], Operands[0][1]); + Candidates.emplace_back(Operands[1][0], Operands[1][1]); + if (isCommutative(MainOp)) { + Candidates.emplace_back(Operands[0][0], Operands[1][1]); + Candidates.emplace_back(Operands[1][0], Operands[0][1]); + } + // No good candidates - not profitable. + if (!R.findBestRootPair(Candidates, + BoUpSLP::LookAheadHeuristics::ScoreSplat)) { + // Deeper analysis for 2 splats/constants.
SmallVector, 4> Candidates1, Candidates2; + Candidates1.emplace_back(Operands[0][0], Operands[0][1]); + Candidates2.emplace_back(Operands[1][0], Operands[1][1]); + bool Res = + R.findBestRootPair(Candidates1) && R.findBestRootPair(Candidates2); + if (!Res && isCommutative(MainOp)) { + Candidates1.clear(); + Candidates2.clear(); + Candidates1.emplace_back(Operands[0][0], Operands[1][1]); + Candidates2.emplace_back(Operands[1][0], Operands[0][1]); + Res = R.findBestRootPair(Candidates1) && + R.findBestRootPair(Candidates2); + } + if (!Res) + return InstructionsState::invalid(); + } + return S; + } + assert(Operands.size() == 2 && "Unexpected number of operands!"); + unsigned CopyableNum = + count_if(VL, [&](Value *V) { return S.isCopyableElement(V); }); + if (CopyableNum < VL.size() / 2) + return S; + // Check profitability if number of copyables >= VL.size() / 2. + // 1. Reorder operands for better matching. + if (isCommutative(MainOp)) { + for (auto &Ops : Operands) { + // Make instructions the first operands. + if (!isa(Ops.front()) && isa(Ops.back())) { + std::swap(Ops.front(), Ops.back()); + continue; + } + // Make constants the second operands. + if (isa(Ops.front())) { + std::swap(Ops.front(), Ops.back()); + continue; + } + } + } + // 2. Check if operands can be vectorized. + if (count_if(Operands.back(), IsaPred) > 1) + return InstructionsState::invalid(); + auto CheckOperand = [&](ArrayRef Ops) { + if (allConstant(Ops) || isSplat(Ops)) + return true; + // Check if it is "almost" splat, i.e. has >= 4 elements and only a + // single one is different. + constexpr unsigned Limit = 4; + if (Operands.front().size() >= Limit) { + SmallDenseMap Counters; + for (Value *V : Ops) { + if (isa(V)) + continue; + ++Counters[V]; + } + if (Counters.size() == 2 && + any_of(Counters, [&](const std::pair &C) { + return C.second == 1; + })) + return true; + } + // First operand not a constant or splat? Last attempt - check for + // potential vectorization.
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI); + InstructionsState OpS = Analysis.buildInstructionsState( + Ops, R, /*TryCopyableElementsVectorization=*/true); + if (!OpS) + return false; + unsigned CopyableNum = + count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); }); + return CopyableNum <= VL.size() / 2; + }; + if (!CheckOperand(Operands.front())) + return InstructionsState::invalid(); + + return S; + } + SmallVector buildOperands(const InstructionsState &S, ArrayRef VL) { assert(S && "Invalid state!"); SmallVector Operands; - buildOriginalOperands(S, VL, Operands); + if (S.areInstructionsWithCopyableElements()) { + MainOp = S.getMainOp(); + MainOpcode = S.getOpcode(); + Operands.assign(MainOp->getNumOperands(), + BoUpSLP::ValueList(VL.size(), nullptr)); + for (auto [Idx, V] : enumerate(VL)) { + SmallVector OperandsForValue = getOperands(S, V); + for (auto [OperandIdx, Operand] : enumerate(OperandsForValue)) + Operands[OperandIdx][Idx] = Operand; + } + } else { + buildOriginalOperands(S, VL, Operands); + } return Operands; } }; } // namespace -BoUpSLP::ScalarsVectorizationLegality -BoUpSLP::getScalarsVectorizationLegality(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) const { +BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( + ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx, + bool TryCopyableElementsVectorization) const { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL, *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + VL, *this, TryCopyableElementsVectorization, + /*WithProfitabilityCheck=*/true); // Don't go into catchswitch blocks, which can happen with PHIs. // Such blocks can only have PHIs and the catchswitch. There is no @@ -10248,9 +11133,9 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, return true; }; - ScalarsVectorizationLegality Legality = - getScalarsVectorizationLegality(VL, Depth, UserTreeIdx); - const InstructionsState &S = Legality.getInstructionsState(); + ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality( + VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false); + InstructionsState S = Legality.getInstructionsState(); if (!Legality.isLegal()) { if (Legality.trySplitVectorize()) { auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL); @@ -10258,11 +11143,18 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp))) return; } - if (Legality.tryToFindDuplicates()) - tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx); + if (!S) + Legality = getScalarsVectorizationLegality( + VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true); + if (!Legality.isLegal()) { + if (Legality.tryToFindDuplicates()) + tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, + UserTreeIdx); - newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); - return; + newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); + return; + } + S = Legality.getInstructionsState(); } // FIXME: investigate if there are profitable cases for VL.size() <= 4. 
@@ -10299,7 +11191,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
 
   SetVector<Value *> UniqueValues(llvm::from_range, VL);
   std::optional<ScheduleBundle *> BundlePtr =
-      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S);
+      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
@@ -13021,7 +13913,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   assert(E->getOpcode() &&
          ((allSameType(VL) && allSameBlock(VL)) ||
           (E->getOpcode() == Instruction::GetElementPtr &&
-           E->getMainOp()->getType()->isPointerTy())) &&
+           E->getMainOp()->getType()->isPointerTy()) ||
+          E->hasCopyableElements()) &&
          "Invalid VL");
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
@@ -13033,6 +13926,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   SmallBitVector UsedScalars(Sz, false);
   for (unsigned I = 0; I < Sz; ++I) {
     if (isa<Instruction>(UniqueValues[I]) &&
+        !E->isCopyableElement(UniqueValues[I]) &&
         getTreeEntries(UniqueValues[I]).front() == E)
       continue;
     UsedScalars.set(I);
@@ -14083,6 +14977,31 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
       }))
     return true;
 
+  // If the tree contains only a buildvector, 2 non-buildvector nodes (whose
+  // user is the root tree node) and other buildvectors, we can skip it.
+  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
+      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
+      VectorizableTree.size() >= Limit &&
+      count_if(ArrayRef(VectorizableTree).drop_front(),
+               [&](const std::unique_ptr<TreeEntry> &TE) {
+                 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
+                        TE->UserTreeIndex.UserTE->Idx == 0;
+               }) == 2)
+    return true;
+
+  // If the tree contains only the vectorization of a phi node fed by a
+  // buildvector - skip it.
+  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
+      VectorizableTree.size() > 2 &&
+      VectorizableTree.front()->State == TreeEntry::Vectorize &&
+      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
+      VectorizableTree[1]->State == TreeEntry::Vectorize &&
+      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
+      all_of(
+          ArrayRef(VectorizableTree).drop_front(2),
+          [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
+    return true;
+
   // We can vectorize the tree if its size is greater than or equal to the
   // minimum size specified by the MinTreeSize command line option.
   if (VectorizableTree.size() >= MinTreeSize)
@@ -16063,6 +16982,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
       auto *I = dyn_cast<Instruction>(V);
       if (!I)
         continue;
+      if (E->isCopyableElement(I))
+        continue;
       if (FirstInst->getParent() == I->getParent()) {
         if (I->comesBefore(FirstInst))
           FirstInst = I;
@@ -16127,7 +17048,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
       return nullptr;
     for (Value *V : E->Scalars) {
       auto *I = dyn_cast<Instruction>(V);
-      if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
+      if (!I || isa<PHINode>(I) ||
+          (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
         continue;
       ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
       if (Bundles.empty())
@@ -18575,7 +19497,7 @@ Value *BoUpSLP::vectorizeTree(
   EntryToLastInstruction.clear();
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules)
-    scheduleBlock(BSIter.second.get());
+    scheduleBlock(*this, BSIter.second.get());
 
   // Cache last instructions for the nodes to avoid side effects, which may
   // appear during vectorization, like extra uses, etc.
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
@@ -19140,7 +20062,7 @@ Value *BoUpSLP::vectorizeTree(
       if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
           EE && IgnoredExtracts.contains(EE))
         continue;
-      if (isa<PoisonValue>(Scalar))
+      if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
         continue;
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
@@ -19381,21 +20303,29 @@ void BoUpSLP::optimizeGatherSequence() {
   GatherShuffleExtractSeq.clear();
 }
 
-BoUpSLP::ScheduleBundle &
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
+    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
   auto &BundlePtr =
       ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
   for (Value *V : VL) {
-    if (doesNotNeedToBeScheduled(V))
+    if (S.isNonSchedulable(V))
       continue;
+    auto *I = cast<Instruction>(V);
+    if (S.isCopyableElement(V)) {
+      // Add a copyable element model.
+      ScheduleCopyableData &SD =
+          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
+      // Group the instructions to a bundle.
+      BundlePtr->add(&SD);
+      continue;
+    }
     ScheduleData *BundleMember = getScheduleData(V);
     assert(BundleMember && "no ScheduleData for bundle member "
                            "(maybe not in same basic block)");
     // Group the instructions to a bundle.
     BundlePtr->add(BundleMember);
-    ScheduledBundles.try_emplace(cast<Instruction>(V))
-        .first->getSecond()
-        .push_back(BundlePtr.get());
+    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
+        BundlePtr.get());
   }
   assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
   return *BundlePtr;
@@ -19405,11 +20335,15 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
 // and schedules instructions until the bundle gets ready.
 std::optional<BoUpSLP::ScheduleBundle *>
 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
-                                            const InstructionsState &S) {
+                                            const InstructionsState &S,
+                                            const EdgeInfo &EI) {
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
   // instructions.
+  bool HasCopyables = S.areInstructionsWithCopyableElements();
   if (isa<PHINode>(S.getMainOp()) ||
-      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
+      isVectorLikeInstWithConstOps(S.getMainOp()) ||
+      (!HasCopyables && doesNotNeedToSchedule(VL)) ||
+      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
     return nullptr;
 
   // Initialize the instruction bundle.
@@ -19417,6 +20351,33 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
 
   auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
+    // Clear the deps, or recalculate the region, if the memory instruction is
+    // a copyable. It may have memory deps, which must be recalculated.
+    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
+      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
+      for (ScheduleEntity *SE : Bundle.getBundle()) {
+        if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
+          if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
+              BundleMember && BundleMember->hasValidDependencies())
+            BundleMember->clearDirectDependencies();
+          continue;
+        }
+        auto *SD = cast<ScheduleData>(SE);
+        for (const Use &U : SD->getInst()->operands()) {
+          unsigned &NumOps =
+              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
+          ++NumOps;
+          if (auto *Op = dyn_cast<Instruction>(U.get());
+              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
+                                                         *SLP, NumOps)) {
+            if (ScheduleData *OpSD = getScheduleData(Op))
+              OpSD->clearDirectDependencies();
+          }
+        }
+      }
+    };
     // The scheduling region got new instructions at the lower end (or it is a
     // new region for the first bundle). This makes it necessary to
     // recalculate all dependencies.
@@ -19426,10 +20387,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
         if (ScheduleData *SD = getScheduleData(I))
           SD->clearDependencies();
+        if (SmallVector<ScheduleCopyableData *> SDs =
+                getScheduleCopyableData(I);
+            !SDs.empty()) {
+          for_each(SDs,
+                   [](ScheduleCopyableData *SD) { SD->clearDependencies(); });
+        }
       }
       ReSchedule = true;
     }
+    // Check if the bundle data already has deps for copyable elements. In
+    // this case, we need to reset the deps and recalculate them.
     if (Bundle && !Bundle.getBundle().empty()) {
+      CheckIfNeedToClearDeps(Bundle);
       LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle
                         << " in block " << BB->getName() << "\n");
       calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
@@ -19448,7 +20418,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
            !ReadyInsts.empty()) {
       ScheduleEntity *Picked = ReadyInsts.pop_back_val();
       assert(Picked->isReady() && "must be ready to schedule");
-      schedule(Picked, ReadyInsts);
+      schedule(*SLP, S, EI, Picked, ReadyInsts);
       if (Picked == &Bundle)
         break;
     }
@@ -19457,7 +20427,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
   for (Value *V : VL) {
-    if (doesNotNeedToBeScheduled(V))
+    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
@@ -19474,11 +20444,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   bool ReSchedule = false;
   for (Value *V : VL) {
-    if (doesNotNeedToBeScheduled(V))
+    if (S.isNonSchedulable(V))
       continue;
+    SmallVector<ScheduleCopyableData *> CopyableData =
+        getScheduleCopyableData(cast<Instruction>(V));
+    if (!CopyableData.empty()) {
+      for (ScheduleCopyableData *SD : CopyableData)
+        ReadyInsts.remove(SD);
+    }
     ScheduleData *BundleMember = getScheduleData(V);
-    assert(BundleMember &&
+    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
+    if (!BundleMember)
+      continue;
 
     // Make sure we don't leave the pieces of the bundle in the ready list when
     // whole bundle might not be ready.
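
Note: the ScheduleCopyableData entries bundled above model a plain value V as an idempotent form of the bundle's opcode, e.g. a lone `b` joining an add bundle is treated as `b + 0` (the shape visible in the updated pr35497.ll and pr47642.ll checks below). A minimal sketch of that identity-picking idea, assuming only right-hand identities; the enum and helper names are illustrative, not SLP API:

#include <cstdint>
#include <optional>

enum class Opcode { Add, Sub, Mul, Or, Xor, Shl, LShr, AShr };

// Right-hand identity C such that "X op C == X". A value V that must join a
// bundle with opcode Op can be modeled as "V op C" without changing
// semantics, making the bundle's opcode uniform.
static std::optional<int64_t> rhsIdentity(Opcode Op) {
  switch (Op) {
  case Opcode::Add:
  case Opcode::Sub:
  case Opcode::Or:
  case Opcode::Xor:
  case Opcode::Shl:
  case Opcode::LShr:
  case Opcode::AShr:
    return 0; // X + 0, X - 0, X | 0, X ^ 0, X << 0, X >> 0
  case Opcode::Mul:
    return 1; // X * 1
  }
  return std::nullopt; // no safe identity: not copyable for this opcode
}

With this, a scalar pair { a + 1, b } becomes the uniform { a + 1, b + 0 } and can be emitted as a single vector add of <a, b> with <1, 0>.
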
@@ -19489,21 +20467,26 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
         ReadyInsts.remove(B);
     }
 
-    if (!BundleMember->isScheduled())
+    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
       continue;
     // A bundle member was scheduled as single instruction before and now
     // needs to be scheduled as part of the bundle. We just get rid of the
     // existing schedule.
+    // A bundle member had its deps calculated before it became a copyable
+    // element - need to reschedule it.
     LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                       << " was already scheduled\n");
     ReSchedule = true;
   }
 
-  ScheduleBundle &Bundle = buildBundle(VL);
+  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
   TryScheduleBundleImpl(ReSchedule, Bundle);
   if (!Bundle.isReady()) {
-    for (ScheduleData *BD : Bundle.getBundle()) {
-      if (BD->isReady()) {
+    for (ScheduleEntity *BD : Bundle.getBundle()) {
+      // Copyable data scheduling is just removed.
+      if (isa<ScheduleCopyableData>(BD))
+        continue;
+      if (!BD->isReady()) {
         ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
         if (Bundles.empty()) {
           ReadyInsts.insert(BD);
@@ -19516,9 +20499,49 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     }
     ScheduledBundlesList.pop_back();
     for (Value *V : VL) {
-      if (doesNotNeedToBeScheduled(V))
+      if (S.isNonSchedulable(V))
        continue;
-      ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back();
+      auto *I = cast<Instruction>(V);
+      if (S.isCopyableElement(I)) {
+        // Remove the copyable data from the scheduling region and restore
+        // previous mappings.
+        auto KV = std::make_pair(EI, I);
+        assert(ScheduleCopyableDataMap.contains(KV) &&
+               "no ScheduleCopyableData for copyable element");
+        ScheduleCopyableData *SD =
+            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
+        ScheduleCopyableDataMapByUsers[I].remove(SD);
+        if (EI.UserTE) {
+          ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
+          const auto *It = find(Op, I);
+          assert(It != Op.end() && "Lane not set");
+          do {
+            int Lane = std::distance(Op.begin(), It);
+            assert(Lane >= 0 && "Lane not set");
+            if (isa<PoisonValue>(EI.UserTE->Scalars[Lane]) &&
+                !EI.UserTE->ReorderIndices.empty())
+              Lane = EI.UserTE->ReorderIndices[Lane];
+            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
+                   "Couldn't find extract lane");
+            auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
+            ScheduleCopyableDataMapByInstUser
+                [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
+                    .pop_back();
+            It = find(make_range(std::next(It), Op.end()), I);
+          } while (It != Op.end());
+          EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
+          if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
+            ScheduleCopyableDataMapByUsers[I].insert(UserCD);
+        }
+        if (ScheduleCopyableDataMapByUsers[I].empty())
+          ScheduleCopyableDataMapByUsers.erase(I);
+        ScheduleCopyableDataMap.erase(KV);
+        // Need to recalculate dependencies for the actual schedule data.
+        if (ScheduleData *OpSD = getScheduleData(I))
+          OpSD->clearDirectDependencies();
+        continue;
+      }
+      ScheduledBundles.find(I)->getSecond().pop_back();
     }
     return std::nullopt;
   }
@@ -19538,10 +20561,6 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
     Value *V, const InstructionsState &S) {
   Instruction *I = dyn_cast<Instruction>(V);
   assert(I && "bundle member must be an instruction");
-  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
-         !doesNotNeedToBeScheduled(I) &&
-         "phi nodes/insertelements/extractelements/extractvalues don't need to "
-         "be scheduled");
   if (getScheduleData(I))
     return true;
   if (!ScheduleStart) {
@@ -19611,14 +20630,14 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
   ScheduleData *CurrentLoadStore = PrevLoadStore;
   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
     // No need to allocate data for non-schedulable instructions.
-    if (doesNotNeedToBeScheduled(I))
+    if (isa<PHINode>(I))
       continue;
     ScheduleData *SD = ScheduleDataMap.lookup(I);
     if (!SD) {
       SD = allocateScheduleDataChunks();
       ScheduleDataMap[I] = SD;
     }
-    assert(!isInSchedulingRegion(SD) &&
+    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
     SD->init(SchedulingRegionID, I);
 
@@ -19651,24 +20670,101 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
                                                      bool InsertInReadyList,
                                                      BoUpSLP *SLP) {
-  SmallVector<ScheduleData *> WorkList;
-  auto ProcessNode = [&](ScheduleData *BundleMember) {
-    if (BundleMember->hasValidDependencies())
+  SmallVector<ScheduleEntity *> WorkList;
+  auto ProcessNode = [&](ScheduleEntity *SE) {
+    if (SE->hasValidDependencies())
       return;
+    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
+      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
+      CD->initDependencies();
+      CD->resetUnscheduledDeps();
+      const EdgeInfo &EI = CD->getEdgeInfo();
+      if (EI.UserTE) {
+        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
+        const auto *It = find(Op, CD->getInst());
+        assert(It != Op.end() && "Lane not set");
+        do {
+          int Lane = std::distance(Op.begin(), It);
+          assert(Lane >= 0 && "Lane not set");
+          if (isa<PoisonValue>(EI.UserTE->Scalars[Lane]) &&
+              !EI.UserTE->ReorderIndices.empty())
+            Lane = EI.UserTE->ReorderIndices[Lane];
+          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
+                 "Couldn't find extract lane");
+          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
+          if (EI.UserTE->isCopyableElement(In)) {
+            // We may not have related copyable scheduling data, if the
+            // instruction is non-schedulable.
+            if (ScheduleCopyableData *UseSD =
+                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
+              CD->incDependencies();
+              if (!UseSD->isScheduled())
+                CD->incrementUnscheduledDeps(1);
+              WorkList.push_back(UseSD);
+            }
+          } else if (ScheduleData *UseSD = getScheduleData(In)) {
+            CD->incDependencies();
+            if (!UseSD->isScheduled())
+              CD->incrementUnscheduledDeps(1);
+            WorkList.push_back(UseSD);
+          }
+          It = find(make_range(std::next(It), Op.end()), CD->getInst());
+        } while (It != Op.end());
+        if (CD->isReady() && CD->getDependencies() == 0 &&
+            (EI.UserTE->hasState() &&
+             (EI.UserTE->getMainOp()->getParent() !=
+                  CD->getInst()->getParent() ||
+              (isa<PHINode>(EI.UserTE->getMainOp()) &&
+               (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
+                any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
+                  auto *IU = dyn_cast<Instruction>(U);
+                  if (!IU)
+                    return true;
+                  return IU->getParent() == EI.UserTE->getMainOp()->getParent();
+                })))))) {
+          // If there are no uses in the block - mark as having a pseudo-use,
+          // which cannot be scheduled.
+          // Prevents incorrect def-use tracking between an external user and
+          // the actual instruction.
+          CD->incDependencies();
+          CD->incrementUnscheduledDeps(1);
+        }
+      }
+      return;
+    }
+    auto *BundleMember = cast<ScheduleData>(SE);
     LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
     BundleMember->initDependencies();
     BundleMember->resetUnscheduledDeps();
     // Handle def-use chain dependencies.
+    SmallDenseMap<User *, unsigned> UserToNumOps;
     for (User *U : BundleMember->getInst()->users()) {
       if (ScheduleData *UseSD = getScheduleData(U)) {
+        // The operand is a copyable element - skip.
+        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
+        ++NumOps;
+        if (areAllOperandsReplacedByCopyableData(
+                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
+          continue;
         BundleMember->incDependencies();
         if (!UseSD->isScheduled())
           BundleMember->incrementUnscheduledDeps(1);
         WorkList.push_back(UseSD);
       }
     }
+    for (ScheduleCopyableData *UseSD :
+         getScheduleCopyableDataUsers(BundleMember->getInst())) {
+      BundleMember->incDependencies();
+      if (!UseSD->isScheduled())
+        BundleMember->incrementUnscheduledDeps(1);
+      WorkList.push_back(UseSD);
+    }
 
+    SmallPtrSet<Instruction *, 4> Visited;
     auto MakeControlDependent = [&](Instruction *I) {
+      // Do not mark as control dependent twice.
+      if (!Visited.insert(I).second)
+        return;
       auto *DepDest = getScheduleData(I);
       assert(DepDest && "must be in schedule window");
       DepDest->addControlDependency(BundleMember);
@@ -19754,7 +20850,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
 
       for (ScheduleData *DepDest = NextLoadStore; DepDest;
            DepDest = DepDest->getNextLoadStore()) {
-        assert(isInSchedulingRegion(DepDest) && "Expected to be in region");
+        assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
 
         // We have two limits to reduce the complexity:
         // 1) AliasedCheckLimit: It's a small limit to reduce calls to
@@ -19802,8 +20898,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
   WorkList.push_back(Bundle.getBundle().front());
   SmallPtrSet<ScheduleBundle *, 16> Visited;
   while (!WorkList.empty()) {
-    ScheduleData *SD = WorkList.pop_back_val();
-    ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(SD->getInst());
+    ScheduleEntity *SD = WorkList.pop_back_val();
+    SmallVector<ScheduleBundle *> CopyableBundle;
+    ArrayRef<ScheduleBundle *> Bundles;
+    if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
+      CopyableBundle.push_back(&CD->getBundle());
+      Bundles = CopyableBundle;
+    } else {
+      Bundles = getScheduleBundles(SD->getInst());
+    }
     if (Bundles.empty()) {
       ProcessNode(SD);
       if (InsertInReadyList && SD->isReady()) {
@@ -19838,21 +20941,37 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
         "tried to reset schedule on block which has not been scheduled");
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
     if (ScheduleData *SD = getScheduleData(I)) {
-      assert(isInSchedulingRegion(*SD) &&
+      assert(isInSchedulingRegion(*SD) &&
            "ScheduleData not in scheduling region");
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
+    if (SmallVector<ScheduleCopyableData *> SDs = getScheduleCopyableData(I);
+        !SDs.empty()) {
+      for_each(SDs, [&](ScheduleCopyableData *SD) {
+        assert(isInSchedulingRegion(*SD) &&
+               "ScheduleData not in scheduling region");
+        SD->setScheduled(/*Scheduled=*/false);
+        SD->resetUnscheduledDeps();
+      });
+    }
     for (ScheduleBundle *Bundle : getScheduleBundles(I)) {
       assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleBundle not in scheduling region");
       Bundle->setScheduled(/*Scheduled=*/false);
     }
   }
+  // Reset schedule data for copyable elements.
+  for (auto &P : ScheduleCopyableDataMap) {
+    if (isInSchedulingRegion(*P.second.get())) {
+      P.second->setScheduled(/*Scheduled=*/false);
+      P.second->resetUnscheduledDeps();
+    }
+  }
   ReadyInsts.clear();
 }
 
-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
 
@@ -19890,15 +21009,45 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
         if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
       }
+      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
+      for (ScheduleCopyableData *SD : reverse(SDs)) {
+        ScheduleBundle &Bundle = SD->getBundle();
+        Bundle.setSchedulingPriority(Idx++);
+        if (!Bundle.hasValidDependencies())
+          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+      }
       continue;
     }
+    SmallVector<ScheduleCopyableData *> CopyableData =
+        BS->getScheduleCopyableDataUsers(I);
     if (ScheduleData *SD = BS->getScheduleData(I)) {
       [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
       assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
-              SDTEs.front()->doesNotNeedToSchedule()) &&
+              SDTEs.front()->doesNotNeedToSchedule() ||
+              doesNotNeedToBeScheduled(I)) &&
            "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
-      continue;
+      if (!SD->hasValidDependencies() &&
+          (!CopyableData.empty() ||
+           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
+             assert(TE->isGather() && "expected gather node");
+             return TE->hasState() && TE->hasCopyableElements() &&
+                    TE->isCopyableElement(I);
+           }))) {
+        // Need to calculate deps for these nodes to correctly handle copyable
+        // dependencies, even if they were cancelled.
+        // If a copyables bundle was cancelled, the deps were cleared and need
+        // to be recalculated.
+        ScheduleBundle Bundle;
+        Bundle.add(SD);
+        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+      }
+    }
+    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
+      ScheduleBundle &Bundle = SD->getBundle();
+      Bundle.setSchedulingPriority(Idx++);
+      if (!Bundle.hasValidDependencies())
+        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
   }
   BS->initialFillReadyList(ReadyInsts);
 
@@ -19914,9 +21063,12 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     // Move the scheduled instruction(s) to their dedicated places, if not
     // there yet.
     if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
-      for (const ScheduleData *BundleMember : Bundle->getBundle()) {
+      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
         Instruction *PickedInst = BundleMember->getInst();
-        if (!Scheduled.insert(PickedInst).second)
+        // If a copyable must be scheduled as part of something else, skip it.
+        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
+        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
+            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
         if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
           PickedInst->moveAfter(LastScheduledInst->getPrevNode());
@@ -19931,7 +21083,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
     }
-    BS->schedule(Picked, ReadyInsts);
+    auto Invalid = InstructionsState::invalid();
+    BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
   }
 
   // Check that we didn't break any of our invariants.
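
Note: the scheduling changes above all maintain one counter protocol: each entity tracks its total dependencies and how many are still unscheduled, and becomes ready when the unscheduled count reaches zero. A minimal standalone sketch of that protocol (illustrative only; not the ScheduleData / ScheduleCopyableData API):

#include <cassert>
#include <vector>

struct Node {
  int Dependencies = 0;      // total deps (valid once calculated)
  int UnscheduledDeps = 0;   // deps whose producers are not yet scheduled
  bool Scheduled = false;
  std::vector<Node *> Users; // nodes that depend on this one
  bool isReady() const { return UnscheduledDeps == 0 && !Scheduled; }
};

// Schedule N and release any user whose last outstanding dependency this was.
static void scheduleNode(Node &N, std::vector<Node *> &ReadyList) {
  assert(N.isReady() && "must be ready to schedule");
  N.Scheduled = true;
  for (Node *U : N.Users)
    if (--U->UnscheduledDeps == 0 && !U->Scheduled)
      ReadyList.push_back(U);
}

int main() {
  Node A, B; // B depends on A
  A.Users.push_back(&B);
  B.Dependencies = B.UnscheduledDeps = 1;
  std::vector<Node *> Ready{&A};
  while (!Ready.empty()) {
    Node *Picked = Ready.back();
    Ready.pop_back();
    scheduleNode(*Picked, Ready); // A first, which releases B
  }
  assert(A.Scheduled && B.Scheduled);
}
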
@@ -20145,7 +21298,7 @@ bool BoUpSLP::collectValuesToDemote( }; if (E.isGather() || !Visited.insert(&E).second || any_of(E.Scalars, [&](Value *V) { - return !isa(V) && all_of(V->users(), [&](User *U) { + return !isa(V) && all_of(V->users(), [&](User *U) { return isa(U) && !isVectorized(U); }); })) @@ -20611,7 +21764,12 @@ void BoUpSLP::computeMinimumValueSizes() { if (!IsKnownPositive) ++BitWidth1; - APInt Mask = DB->getDemandedBits(cast(Root)); + auto *I = dyn_cast(Root); + if (!I) { + MaxBitWidth = std::max(BitWidth1, MaxBitWidth); + continue; + } + APInt Mask = DB->getDemandedBits(I); unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); MaxBitWidth = std::max(std::min(BitWidth1, BitWidth2), MaxBitWidth); @@ -20940,7 +22098,9 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, for (Value *V : Chain) ValOps.insert(cast(V)->getValueOperand()); // Operands are not same/alt opcodes or non-power-of-2 uniques - exit. - InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true); if (all_of(ValOps, IsaPred) && ValOps.size() > 1) { DenseSet Stores(Chain.begin(), Chain.end()); bool IsAllowedSize = diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index 7ed5f33c9dc6c..c791a07993440 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -4,11 +4,7 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[BB:.*:]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[ADD]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 +; CHECK-NEXT: [[SELECT:%.*]] = select i1 false, i32 0, i32 0 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]] ; CHECK-NEXT: store ptr addrspace(1) null, ptr addrspace(1) [[GETELEMENTPTR]], align 8 @@ -16,8 +12,6 @@ define void @test() { ; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer) ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4) ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll index fa46bd3d83249..d46098e754136 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll @@ -19,13 +19,13 @@ define void @test(ptr %0, i32 %add651) { ; CHECK-NEXT: [[ARRAYIDX660:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7800 ; CHECK-NEXT: [[ARRAYIDX689:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7816 ; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) -; 
CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[ADD651]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP13]], <2 x i32> [[TMP10]], i64 2) -; CHECK-NEXT: [[TMP15:%.*]] = lshr <4 x i32> [[TMP14]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> , i32 [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP14]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = lshr <4 x i32> [[TMP19]], splat (i32 1) ; CHECK-NEXT: [[SHR685:%.*]] = lshr i32 [[TMP2]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[CONV686:%.*]] = trunc i32 [[SHR685]] to i16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll index 992909fb3e87f..5e3d4715e99c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll @@ -7,16 +7,10 @@ define i32 @test() { ; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]] ; CHECK: [[FUNC_135_EXIT_I]]: ; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v12i32(<16 x i32> poison, <12 x i32> [[TMP3]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll index 1c482e079bb0f..03d76ef571d64 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll @@ -4,11 +4,10 @@ define i64 @test() { ; CHECK-LABEL: define i64 @test() { ; CHECK-NEXT: 
[[BB:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 0, i32 1 ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> zeroinitializer, [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: br label %[[BB5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll index 382d6ae0e0a6f..6bb52e0fc43b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll @@ -7,19 +7,17 @@ define void @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[IF_THEN_I_I:.*]]: -; CHECK-NEXT: br label %[[BB5:.*]] +; CHECK-NEXT: br label %[[BB3:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP0:%.*]] = zext i1 false to i64 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> , <2 x i64> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> , <2 x i64> [[TMP2]], i64 2) -; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB2:.*]] -; CHECK: [[BB5]]: -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> , i64 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> zeroinitializer, [[TMP1]] +; CHECK-NEXT: br i1 false, label %[[BB3]], label %[[BB2:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ [[TMP2]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] ; CHECK-NEXT: br label %[[BB2]] ; CHECK: [[BB2]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP6]], %[[BB5]] ], [ [[TMP4]], %[[BB1]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB3]] ], [ [[TMP2]], %[[BB1]] ] ; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr null, i64 40), align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll index 9fbe0a54b0688..64344342ffe3a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -59,8 +59,6 @@ define void @pr35497() local_unnamed_addr #0 { ; SSE-LABEL: @pr35497( ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, ptr undef, align 1 -; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef -; SSE-NEXT: store i64 [[ADD]], ptr undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4 ; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0 ; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 2) @@ -68,32 +66,34 @@ define void @pr35497() local_unnamed_addr #0 { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> ; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer ; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1 -; 
SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef +; SSE-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], +; SSE-NEXT: store i64 [[ADD]], ptr undef, align 1 ; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], splat (i64 2) ; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], splat (i64 20) +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> ; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], splat (i64 6) -; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] +; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP12]], [[TMP10]] ; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( ; AVX-NEXT: entry: ; AVX-NEXT: [[TMP0:%.*]] = load i64, ptr undef, align 1 -; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef -; AVX-NEXT: store i64 [[ADD]], ptr undef, align 1 ; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4 ; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 2) ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], splat (i64 20) ; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1 +; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef +; AVX-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], +; AVX-NEXT: store i64 [[ADD]], ptr undef, align 1 ; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], splat (i64 2) ; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], splat (i64 20) +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> ; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], splat (i64 6) -; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]] +; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP11]], [[TMP9]] ; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll index a4949bc67b0f1..782aada17acac 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll @@ -6,14 +6,9 @@ target triple = "x86_64-unknown-linux-gnu" define <4 x i32> @foo(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1 -; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECINIT]], <4 x i32> poison, <4 x i32> 
zeroinitializer +; CHECK-NEXT: [[VECINIT51:%.*]] = add <4 x i32> [[TMP2]], ; CHECK-NEXT: ret <4 x i32> [[VECINIT51]] ; %vecinit = insertelement <4 x i32> undef, i32 %f, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll index a17ccb4b46ef9..a56c6b76ba39f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll @@ -5,9 +5,11 @@ define i1 @test() { ; CHECK-LABEL: define i1 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[H_PROMOTED118_I_FR:%.*]] = freeze i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> , i32 [[H_PROMOTED118_I_FR]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[H_PROMOTED118_I_FR]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[TMP6]], diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll index ad4daeab003f5..125c2dce32663 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll @@ -150,9 +150,9 @@ define <2 x i32> @replace_through_int_casts_ele0_only(i16 %inp, <2 x i16> %dead) define <2 x i8> @replace_through_binop_fail_cant_speculate(i8 %inp, <2 x i8> %d, <2 x i8> %any) { ; CHECK-LABEL: define <2 x i8> @replace_through_binop_fail_cant_speculate( ; CHECK-SAME: i8 [[INP:%.*]], <2 x i8> [[D:%.*]], <2 x i8> [[ANY:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[INP]], 5 -; CHECK-NEXT: [[V0:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i64 0 -; CHECK-NEXT: [[V:%.*]] = insertelement <2 x i8> [[V0]], i8 [[ADD]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[V:%.*]] = add <2 x i8> [[TMP2]], ; CHECK-NEXT: [[DIV0:%.*]] = sdiv <2 x i8> splat (i8 -128), [[V]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[INP]], 123 ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i8> [[DIV0]], i8 [[TMP1]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index afe92f89ac0d1..11c4dc9f16880 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -332,12 +332,12 @@ define void @test11(<2 x i64> %0, i64 %1, <2 x i64> %2) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> , [[TMP2:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[TMP4]] to <2 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP5]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i16> -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP6]], <2 x i16> [[TMP7]], i64 2) -; CHECK-NEXT: 
[[TMP9:%.*]] = trunc <4 x i16> [[TMP8]] to <4 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> , i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> zeroinitializer, i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP2:%.*]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP6]], <2 x i64> [[TMP3]], i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i64> [[TMP8]] to <4 x i8> ; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> poison, <2 x i8> zeroinitializer, i64 0) ; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP10]], <2 x i8> zeroinitializer, i64 2) ; CHECK-NEXT: [[TMP12:%.*]] = urem <4 x i8> [[TMP9]], [[TMP11]]