From 652e7a5fb486c992c90bf0bdfcadfb12fb41b72c Mon Sep 17 00:00:00 2001 From: Zach Goldthorpe Date: Mon, 30 Jun 2025 09:24:53 -0500 Subject: [PATCH 1/2] Squashing wip branch. --- llvm/include/llvm/InitializePasses.h | 1 + llvm/include/llvm/Transforms/Scalar.h | 7 + .../Scalar/PackedIntegerCombinePass.h | 32 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassBuilderPipelines.cpp | 7 + llvm/lib/Passes/PassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11 +- llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 + .../Scalar/PackedIntegerCombinePass.cpp | 1936 +++++++++++++++++ llvm/lib/Transforms/Scalar/Scalar.cpp | 1 + .../CodeGen/AMDGPU/combine-vload-extract.ll | 6 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 3 + llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 105 +- .../AMDGPU/splitkit-getsubrangeformask.ll | 168 +- llvm/test/Other/new-pm-defaults.ll | 1 + .../Other/new-pm-thinlto-postlink-defaults.ll | 1 + .../new-pm-thinlto-postlink-pgo-defaults.ll | 1 + ...-pm-thinlto-postlink-samplepgo-defaults.ll | 1 + .../Other/new-pm-thinlto-prelink-defaults.ll | 1 + .../new-pm-thinlto-prelink-pgo-defaults.ll | 1 + ...w-pm-thinlto-prelink-samplepgo-defaults.ll | 1 + .../PackedIntegerCombine/instructions.ll | 601 +++++ .../PackedIntegerCombine/int2int.ll | 302 +++ .../PackedIntegerCombine/int2vec.ll | 393 ++++ .../PackedIntegerCombine/vec2int.ll | 480 ++++ .../PackedIntegerCombine/vec2vec.ll | 294 +++ 26 files changed, 4173 insertions(+), 184 deletions(-) create mode 100644 llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h create mode 100644 llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp create mode 100644 llvm/test/Transforms/PackedIntegerCombine/instructions.ll create mode 100644 llvm/test/Transforms/PackedIntegerCombine/int2int.ll create mode 100644 llvm/test/Transforms/PackedIntegerCombine/int2vec.ll create mode 100644 llvm/test/Transforms/PackedIntegerCombine/vec2int.ll create mode 100644 llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 1c4ed3843b390..7e934e635c063 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -237,6 +237,7 @@ initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &); LLVM_ABI void initializeOptimizePHIsLegacyPass(PassRegistry &); LLVM_ABI void initializePEILegacyPass(PassRegistry &); LLVM_ABI void initializePHIEliminationPass(PassRegistry &); +LLVM_ABI void initializePackedIntegerCombineLegacyPassPass(PassRegistry &); LLVM_ABI void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &); LLVM_ABI void initializePatchableFunctionLegacyPass(PassRegistry &); LLVM_ABI void initializePeepholeOptimizerLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index 1398f171b0f78..ec9d89507c375 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -154,6 +154,13 @@ LLVM_ABI FunctionPass * createInferAddressSpacesPass(unsigned AddressSpace = ~0u); LLVM_ABI extern char &InferAddressSpacesID; +//===----------------------------------------------------------------------===// +// +// PackedIntegerCombinePass - Tracks individual bytes through instructions to +// systematically identify redundant byte packing or unpacking operations. 
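+// For example (illustrative only, not taken from the tests in this patch),
+// a reassembly of bytes that were never really taken apart, such as
+//   %lo = and i32 %x, 65535
+//   %hi = and i32 %x, -65536
+//   %r  = or i32 %lo, %hi
+// can be recognised as producing exactly the bytes of %x.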
+// +LLVM_ABI FunctionPass *createPackedIntegerCombinePass(); + //===----------------------------------------------------------------------===// // // PartiallyInlineLibCalls - Tries to inline the fast path of library diff --git a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h new file mode 100644 index 0000000000000..a5916e2e611cf --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h @@ -0,0 +1,32 @@ +//===- PackedIntegerCombinePass.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file provides the interface for LLVM's Packed Integer Combine pass. +/// This pass tries to treat integers as packed chunks of individual bytes, +/// and leverage this to coalesce needlessly fragmented +/// computations. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H +#define LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class PackedIntegerCombinePass + : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 0697a0a6b4c74..7a382ace34dbc 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -313,6 +313,7 @@ #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/NewGVN.h" +#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h" #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" #include "llvm/Transforms/Scalar/PlaceSafepoints.h" #include "llvm/Transforms/Scalar/Reassociate.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index c83d2dc1f1514..2da72606bc47a 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -121,6 +121,7 @@ #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/NewGVN.h" +#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h" #include "llvm/Transforms/Scalar/Reassociate.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Scalar/SROA.h" @@ -542,6 +543,9 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, // opportunities that creates). FPM.addPass(BDCEPass()); + // Simplify bit-packed operations before cleaning up with instcombine. + FPM.addPass(PackedIntegerCombinePass()); + // Run instcombine after redundancy and dead bit elimination to exploit // opportunities opened up by them. FPM.addPass(InstCombinePass()); @@ -743,6 +747,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // opportunities that creates). FPM.addPass(BDCEPass()); + // Simplify bit-packed operations before cleaning up with instcombine. 
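+  // The pass is also registered as "packedintcombine", so the simplification
+  // can be exercised in isolation with, e.g.,
+  //   opt -passes=packedintcombine -S < input.ll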
+ FPM.addPass(PackedIntegerCombinePass()); + // Run instcombine after redundancy and dead bit elimination to exploit // opportunities opened up by them. FPM.addPass(InstCombinePass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 65276489e6f02..6f1c405a5efa7 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -476,6 +476,7 @@ FUNCTION_PASS("objc-arc", ObjCARCOptPass()) FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass()) FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass()) FUNCTION_PASS("pa-eval", PAEvalPass()) +FUNCTION_PASS("packedintcombine", PackedIntegerCombinePass()) FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass()) FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt()) FUNCTION_PASS("place-safepoints", PlaceSafepointsPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c3536113e9bef..8f0ef348fe778 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -104,6 +104,7 @@ #include "llvm/Transforms/Scalar/InferAddressSpaces.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" +#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h" #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" #include "llvm/Transforms/Scalar/Sink.h" #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" @@ -1378,8 +1379,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { TargetPassConfig::addCodeGenPrepare(); - if (isPassEnabled(EnableLoadStoreVectorizer)) + if (isPassEnabled(EnableLoadStoreVectorizer)) { addPass(createLoadStoreVectorizerPass()); + // LSV pass opens up more opportunities for packed integer combining. + addPass(createPackedIntegerCombinePass()); + } // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it @@ -2101,8 +2105,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { Base::addCodeGenPrepare(addPass); - if (isPassEnabled(EnableLoadStoreVectorizer)) + if (isPassEnabled(EnableLoadStoreVectorizer)) { addPass(LoadStoreVectorizerPass()); + // LSV pass opens up more opportunities for packed integer combining. + addPass(PackedIntegerCombinePass()); + } // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 84a5b02043d01..d45a3785f9f8f 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -61,6 +61,7 @@ add_llvm_component_library(LLVMScalarOpts NaryReassociate.cpp NewGVN.cpp PartiallyInlineLibCalls.cpp + PackedIntegerCombinePass.cpp PlaceSafepoints.cpp Reassociate.cpp Reg2Mem.cpp diff --git a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp new file mode 100644 index 0000000000000..31edd28069a2b --- /dev/null +++ b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp @@ -0,0 +1,1936 @@ +//===- PackedIntegerCombinePass.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This file provides the interface for LLVM's Packed Integer Combine pass. +/// This pass tries to treat integers as packed chunks of individual bytes, +/// and leverage this to coalesce needlessly fragmented +/// computations. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +#define DEBUG_TYPE "packedintcombine" + +static cl::opt MaxCollectionIterations( + "packedint-max-iterations", + cl::desc("Maximum number of iterations to isolate final packed " + "instructions. Set to 0 to iterate until convergence."), + cl::init(2), cl::Hidden); + +static cl::opt + AggressiveRewriting("packedint-aggressive-rewriter", + cl::desc("Aggressively rewrite packed instructions."), + cl::init(false), cl::Hidden); + +namespace { + +/// Reference to either a constant byte, or a byte extracted from an IR value. +class Byte { + /// The base value from which the byte is obtained. + Value *Base; + + /// If the base value is not null, then this holds the index of the byte + /// being used, where 0 is the least significant byte. + /// Otherwise, this is treated as a constant byte. + unsigned Integer; + +public: + static constexpr unsigned BitWidth = 8; + static constexpr unsigned AllOnes = 0xff; + + /// Construct a byte from a well-defined IR value. + explicit Byte(Value &Base, unsigned Index) : Base(&Base), Integer(Index) {} + + /// Construct a constant byte. + explicit Byte(unsigned Constant) : Base(nullptr), Integer(Constant) { + assert(Constant <= AllOnes && "Constant is too large to fit in a byte."); + } + + /// Construct a constant byte that is fully set. + static Byte ones() { return Byte(Byte::AllOnes); } + /// Construct the zero byte. + static Byte zeroes() { return Byte(0); } + + /// Indicate whether the byte is a known integer constant. + /// Note that poison or undef base values are not recognised as constant. + bool isConstant() const { return !Base; } + + /// Get the constant byte value. + unsigned getConstant() const { + assert(isConstant() && "Expected a constant byte."); + return Integer; + } + + /// Get the base IR value from which this byte is obtained. + Value *getBase() const { + assert(!isConstant() && "Byte constants do not have a base value."); + return Base; + } + + /// Get the byte offset of the IR value referenced by the byte. 
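+  /// For example (illustrative), for an i32 value V, Byte(*V, 2) refers to
+  /// bits [16, 24) of V and its getIndex() returns 2:
+  /// \code
+  ///   Byte B(*V, 2);               // third least-significant byte of V
+  ///   unsigned Idx = B.getIndex(); // == 2
+  /// \endcode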
+ unsigned getIndex() const { + assert(!isConstant() && "Byte constants are not indexed."); + return Integer; + } + + bool operator==(const Byte &Other) const { + return Base == Other.Base && Integer == Other.Integer; + } + + void print(raw_ostream &ROS, bool NewLine = true) const { + if (isConstant()) + ROS << "const"; + else + Base->printAsOperand(ROS, false); + + ROS << "[" << Integer << "]"; + + if (NewLine) + ROS << "\n"; + } + + LLVM_DUMP_METHOD void dump() const { print(errs(), true); } +}; + +inline raw_ostream &operator<<(raw_ostream &ROS, const Byte &B) { + B.print(ROS, false); + return ROS; +} + +/// Convenience data structure for describing the layout of bytes for vector and +/// integer types, treating integer types as singleton vectors. +struct ByteLayout { + /// The number of bytes that fit in a single element. + unsigned NumBytesPerElement; + /// The number of vector elements (or 1, if the type is an integer type). + unsigned NumVecElements; + + /// Get the total number of bytes held by the vector or integer type. + unsigned getNumBytes() const { return NumBytesPerElement * NumVecElements; } +}; + +/// Interpret the given type as a number of packed bytes, if possible. +static std::optional tryGetByteLayout(const Type *Ty) { + unsigned IntBitWidth, NumElts; + if (const auto *IntTy = dyn_cast(Ty)) { + IntBitWidth = IntTy->getBitWidth(); + NumElts = 1; + } else if (const auto *VecTy = dyn_cast(Ty)) { + const auto *IntTy = dyn_cast(VecTy->getElementType()); + if (!IntTy) + return std::nullopt; + IntBitWidth = IntTy->getBitWidth(); + NumElts = VecTy->getNumElements(); + } else + return std::nullopt; + + if (IntBitWidth % Byte::BitWidth != 0) + return std::nullopt; + + return ByteLayout{IntBitWidth / Byte::BitWidth, NumElts}; +} + +/// Interpret the given type as a number of backed bytes (aborts if impossible). +static ByteLayout getByteLayout(const Type *Ty) { + const std::optional Layout = tryGetByteLayout(Ty); + assert(Layout); + return *Layout; +} + +/// A convenience class for combining Byte instances obtained from the same base +/// value, and with a common relative offset, which can hence be obtained +/// simultaneously. +struct CoalescedBytes { + /// The value from which the coalesced bytes are all derived. This pointer is + /// never null. + Value *Base; + /// The number of bytes to shift right to align the coalesced bytes with the + /// target value. + /// + /// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide + /// bytes 0, 1, 2 of the target %tgt, then ShrByteOffset = 3. + signed SignedShrByteOffset; + /// The bitmask identifying which bytes of the target value are covered by + /// these coalesced bytes. + /// + /// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide + /// bytes 0, 1, 2 of the target %tgt, then this mask's first three bits will + /// be set, corresponding to the first three bits of %tgt. + SmallBitVector Mask; + + explicit CoalescedBytes(Value &Base, signed Offset, SmallBitVector Mask) + : Base(&Base), SignedShrByteOffset(Offset), Mask(Mask) {} + explicit CoalescedBytes(Value &Base, signed Offset, unsigned NumBytes) + : Base(&Base), SignedShrByteOffset(Offset), Mask(NumBytes) {} + + bool alignsWith(Value *V, signed VOffset) const { + return Base == V && SignedShrByteOffset == VOffset; + } + + /// Get the number of bytes to shift the base value right to align with the + /// target value. 
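+  /// For example, SignedShrByteOffset == 3 gives getShrBytes() == 3 and
+  /// getShlBytes() == 0, whereas SignedShrByteOffset == -2 gives
+  /// getShrBytes() == 0 and getShlBytes() == 2.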
+ unsigned getShrBytes() const { return std::max(0, SignedShrByteOffset); } + + /// Get the number of bytes to shift the base value left to align with the + /// target value. + unsigned getShlBytes() const { return std::max(0, -SignedShrByteOffset); } + + /// Get the number of bits to shift the base value right to align with the + /// target value. + unsigned getShrBits() const { return getShrBytes() * Byte::BitWidth; } + + /// Get the number of bits to shift the base value left to align with the + /// target value. + unsigned getShlBits() const { return getShlBytes() * Byte::BitWidth; } + + void print(raw_ostream &ROS, bool NewLine = true) const { + ROS << "{ "; + for (unsigned Idx = 0; Idx < Mask.size(); ++Idx) { + if (Mask.test(Idx)) { + Base->printAsOperand(ROS, false); + ROS << "[" << (static_cast(Idx) + SignedShrByteOffset) << "]"; + } else + ROS << 0; + + ROS << "; "; + } + ROS << "}"; + + if (NewLine) + ROS << "\n"; + } + + LLVM_DUMP_METHOD void dump() const { print(errs(), true); } +}; + +inline raw_ostream &operator<<(raw_ostream &ROS, const CoalescedBytes &CB) { + CB.print(ROS, false); + return ROS; +} + +/// Association of a Byte (constant or byte extracted from an LLVM Value) to the +/// operand(s) responsible for producing it. A value of ByteUse::AllOperands +/// (-1) indicates that all operands are responsible for producing the given +/// byte. +class ByteUse { + + Byte B; + int OpIdx; + +public: + /// Sentinel value representing that all operands are responsible for the + /// given Byte. + static constexpr int AllOperands = -1; + + ByteUse(Byte B, int OpIdx) : B(B), OpIdx(OpIdx) {} + + const Byte &getByte() const { return B; } + int getOperandIndex() const { return OpIdx; } + + bool operator==(const ByteUse &BU) const { + return BU.B == B && BU.OpIdx == OpIdx; + } +}; + +using ByteVector = SmallVector; + +/// The decomposition of an IR value into its individual bytes, tracking where +/// each byte is obtained. +class ByteDefinition { + /// Enum classifying what Ptr points to. + enum ByteType : uint8_t { + /// Ptr's value is undefined. + INVALID, + /// The byte definition is given by a ByteVector, which is referenced (but + /// not captured) by Ptr. + VECTOR, + /// The bytes are obtained from a (currently opaque) IR value, held by Ptr. + VALUE, + /// The bytes are obtained from a constant integer, held by Ptr. + CONST_INT, + /// The bytes are obtained from a constant vector of integers, held by Ptr. + CONST_VEC, + }; + + ByteType DefType; + void *Ptr; + ByteLayout Layout; + ByteDefinition(ByteType DefType, void *Ptr, ByteLayout Layout) + : DefType(DefType), Ptr(Ptr), Layout(Layout) {} + +public: + /// Indicate that a value cannot be decomposed into bytes in a known way. + static ByteDefinition invalid() { return {INVALID, nullptr, {0, 0}}; } + /// Indicate that a value's bytes are known, and track their producers. + static ByteDefinition vector(ByteVector &Ref, ByteLayout Layout) { + return {VECTOR, &Ref, Layout}; + } + /// Indicate that a value's bytes are opaque. + static ByteDefinition value(Value &V) { + return {VALUE, &V, getByteLayout(V.getType())}; + } + /// Indicate that the bytes come from a constant integer. + static ByteDefinition constInt(ConstantInt &Int) { + return {CONST_INT, &Int, getByteLayout(Int.getType())}; + } + /// Indicate that the bytes come from a constant vector of integers. 
+ static ByteDefinition constVec(Constant &Vec) { + assert(Vec.getType()->isVectorTy()); + return {CONST_VEC, &Vec, getByteLayout(Vec.getType())}; + } + + ByteVector &getVector() const { + assert(DefType == VECTOR); + return *static_cast(Ptr); + } + Value &getValue() const { + assert(DefType == VALUE); + return *static_cast(Ptr); + } + ConstantInt &getConstInt() const { + assert(DefType == CONST_INT); + return *static_cast(Ptr); + } + Constant &getConstVec() const { + assert(DefType == CONST_VEC); + return *static_cast(Ptr); + } + + bool isValid() const { return DefType != INVALID; } + + /// Return true iff the byte definition is valid. + operator bool() const { return isValid(); } + + /// Get the definition of the byte at the specified byte offset, where 0 is + /// the least significant byte. + Byte getByte(unsigned Idx) const { + switch (DefType) { + default: + llvm_unreachable("Invalid byte definition"); + case VECTOR: + return getVector()[Idx].getByte(); + case VALUE: + return Byte(getValue(), Idx); + case CONST_INT: + return Byte(getConstInt().getValue().extractBitsAsZExtValue( + Byte::BitWidth, Idx * Byte::BitWidth)); + case CONST_VEC: { + const auto &Vec = getConstVec(); + const ByteLayout Layout = getByteLayout(Vec.getType()); + const unsigned VecIdx = Idx / Layout.NumBytesPerElement; + const unsigned EltIdx = Idx % Layout.NumBytesPerElement; + + Constant *Elt = Vec.getAggregateElement(VecIdx); + if (const auto *Int = dyn_cast(Elt)) + return Byte(Int->getValue().extractBitsAsZExtValue( + Byte::BitWidth, EltIdx * Byte::BitWidth)); + + return Byte(*Elt, EltIdx); + } + } + } + + const ByteLayout &getLayout() const { return Layout; } + + void print(raw_ostream &ROS, bool NewLine = true) const { + switch (DefType) { + default: + ROS << "[INVALID]"; + break; + case VECTOR: { + ByteVector &BV = getVector(); + ROS << "{ "; + for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx) + ROS << ByteIdx << ": " << BV[ByteIdx].getByte() << "; "; + ROS << "}"; + break; + } + case VALUE: + ROS << "("; + getValue().printAsOperand(ROS); + ROS << ")[0:" << Layout.getNumBytes() << "]"; + break; + case CONST_INT: + ROS << getConstInt(); + break; + case CONST_VEC: + ROS << getConstVec(); + break; + } + + if (NewLine) + ROS << "\n"; + } + + LLVM_DUMP_METHOD void dump() const { print(errs(), true); } +}; + +inline raw_ostream &operator<<(raw_ostream &ROS, const ByteDefinition &Def) { + Def.print(ROS, false); + return ROS; +} + +/// Tries to update byte definitions using the provided instruction. +/// +/// In order to avoid eliminating values which are required for multiple packed +/// integers, the ByteExpander distinguishes two types of packed integer values: +/// - "Final" values, which are packed bytes which are either used by +/// instructions that cannot be classified as packed byte operations, or +/// values which are used by several other "final" values. +/// - "Intermediate" values, which are values whose sole raisons d'etre are to +/// produce bytes for a unique final value. +/// +/// Effectively, intermediate values may be eliminated or replaced freely, +/// whereas final values must remain present in the IR after the pass completes. +/// Accordingly, byte defniitions of final values are expanded only up to other +/// final value producers. +class ByteExpander final : public InstVisitor { + /// Resolution of values to their known definitions. + DenseMap Definitions; + /// Map to all (eventual) non-intermediate users of a value. 
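+  /// For example (illustrative), if %a is used only by an intermediate value
+  /// %b, and %b is used only by the final value %c, then the final users of
+  /// %a are {%c}.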
+ DenseMap> FinalUsers; + + void updateFinalUsers(Value *V); + bool checkIfIntermediate(Value *V, bool IsOperand); + +public: + // Visitation implementations return `true` iff a new byte definition was + // successfully constructed. + + ByteVector visitAdd(BinaryOperator &I); + ByteVector visitAnd(BinaryOperator &I); + ByteVector visitOr(BinaryOperator &I); + ByteVector visitXor(BinaryOperator &I); + ByteVector visitShl(BinaryOperator &I); + ByteVector visitLShr(BinaryOperator &I); + ByteVector visitTruncInst(TruncInst &I); + ByteVector visitZExtInst(ZExtInst &I); + ByteVector visitBitCastInst(BitCastInst &I); + ByteVector visitExtractElementInst(ExtractElementInst &I); + ByteVector visitInsertElementInst(InsertElementInst &I); + ByteVector visitShuffleVectorInst(ShuffleVectorInst &I); + // fallback for unhandled instructions + ByteVector visitInstruction(Instruction &I) { return {}; } + + /// Return the final values producing each byte of a value, if known, or + /// otherwise return a nullptr. + ByteVector *expandByteDefinition(Value *V); + + /// Decompose a value into its bytes. If \p ExpandDef is true, expand each + /// byte to the final values producing them if possible. The return value is + /// guaranteed to be valid so long as the value passed can be viewed as packed + /// bytes. + ByteDefinition getByteDefinition(Value *V, bool ExpandDef = true); + + /// Same as above, but only expand bytes to their final value producers if the + /// value \p V in question is an intermediate value. This is provided as a + /// convenience for instruction visitation, as definitions should only expand + /// until final value producers, even if the final value producers' bytes can + /// be expanded further. + ByteDefinition getByteDefinitionIfIntermediateOperand(Value *V); + + /// Get the set of all final values which use \p V. + const DenseSet &getFinalUsers(Value *V); + + /// Check if the provided value is known to be an intermediate value. + bool checkIfIntermediate(Value *V) { return checkIfIntermediate(V, false); } + + /// Iterate over all instructions in a function over several passes to + /// identify all final values and their byte definitions. + std::vector collectPIICandidates(Function &F); +}; + +ByteVector ByteExpander::visitAdd(BinaryOperator &I) { + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const ByteLayout &Layout = LhsDef.getLayout(); + const unsigned NumBytes = Layout.getNumBytes(); + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { + const Byte Lhs = LhsDef.getByte(ByteIdx); + const Byte Rhs = RhsDef.getByte(ByteIdx); + + const bool LhsIsZero = Lhs.isConstant() && Lhs.getConstant() == 0; + const bool RhsIsZero = Rhs.isConstant() && Rhs.getConstant() == 0; + if (LhsIsZero) + BV.emplace_back(Rhs, RhsIsZero ? 
ByteUse::AllOperands : 1); + else if (RhsIsZero) + BV.emplace_back(Lhs, 0); + else + return {}; + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitAnd(BinaryOperator &I) { + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const ByteLayout &Layout = LhsDef.getLayout(); + const unsigned NumBytes = Layout.getNumBytes(); + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { + const Byte Lhs = LhsDef.getByte(ByteIdx); + const Byte Rhs = RhsDef.getByte(ByteIdx); + + if (Lhs == Rhs) { + BV.emplace_back(Lhs, ByteUse::AllOperands); + continue; + } + + if (Lhs.isConstant()) { + if (Lhs.getConstant() == 0) { + BV.emplace_back(Byte::zeroes(), 0); + continue; + } + if (Lhs.getConstant() == Byte::AllOnes) { + BV.emplace_back(Rhs, 1); + continue; + } + } + if (Rhs.isConstant()) { + if (Rhs.getConstant() == 0) { + BV.emplace_back(Byte::zeroes(), 1); + continue; + } + if (Rhs.getConstant() == Byte::AllOnes) { + BV.emplace_back(Lhs, 0); + continue; + } + } + + if (Lhs == Rhs) { + BV.emplace_back(Lhs, ByteUse::AllOperands); + continue; + } + return {}; + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitOr(BinaryOperator &I) { + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const ByteLayout &Layout = LhsDef.getLayout(); + const unsigned NumBytes = Layout.getNumBytes(); + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { + const Byte Lhs = LhsDef.getByte(ByteIdx); + const Byte Rhs = RhsDef.getByte(ByteIdx); + + if (Lhs == Rhs) { + BV.emplace_back(Lhs, ByteUse::AllOperands); + continue; + } + + if (Lhs.isConstant()) { + if (Lhs.getConstant() == 0) { + BV.emplace_back(Rhs, 1); + continue; + } + if (Lhs.getConstant() == Byte::AllOnes) { + BV.emplace_back(Byte::ones(), 0); + continue; + } + } + + if (Rhs.isConstant()) { + if (Rhs.getConstant() == 0) { + BV.emplace_back(Lhs, 0); + continue; + } + if (Rhs.getConstant() == Byte::AllOnes) { + BV.emplace_back(Byte::ones(), 1); + continue; + } + } + + return {}; + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitXor(BinaryOperator &I) { + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const ByteLayout &Layout = LhsDef.getLayout(); + const unsigned NumBytes = Layout.getNumBytes(); + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { + const Byte Lhs = LhsDef.getByte(ByteIdx); + const Byte Rhs = RhsDef.getByte(ByteIdx); + if (Lhs == Rhs) + BV.emplace_back(Byte::zeroes(), ByteUse::AllOperands); + else if (Lhs.isConstant() && Lhs.getConstant() == 0) + BV.emplace_back(Rhs, 1); + else if (Rhs.isConstant() && Rhs.getConstant() == 0) + BV.emplace_back(Lhs, 0); + else + return {}; + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitShl(BinaryOperator &I) { + const ByteDefinition BaseDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if 
(!BaseDef) + return {}; + + const unsigned NumBytes = BaseDef.getLayout().getNumBytes(); + + const auto *Const = dyn_cast(I.getOperand(1)); + if (!Const) + return {}; + + if (isa(Const)) { + const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue(); + if (ShAmt % Byte::BitWidth != 0) + return {}; + const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytes); + + ByteVector BV; + BV.reserve(NumBytes); + BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0)); + for (unsigned ByteIdx = 0; ByteIdx + ByteShAmt < NumBytes; ++ByteIdx) + BV.emplace_back(BaseDef.getByte(ByteIdx), 0); + + assert(BV.size() == NumBytes); + return BV; + } + + assert(Const->getType()->isVectorTy()); + + ByteVector BV; + BV.reserve(NumBytes); + + const unsigned NumBytesPerElt = BaseDef.getLayout().NumBytesPerElement; + for (unsigned EltIdx = 0; EltIdx < BaseDef.getLayout().NumVecElements; + ++EltIdx) { + const auto *ConstInt = + dyn_cast(Const->getAggregateElement(EltIdx)); + if (!ConstInt) + return {}; + const unsigned ShAmt = ConstInt->getValue().getLimitedValue(); + if (ShAmt % Byte::BitWidth != 0) + return {}; + + const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytesPerElt); + BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0)); + for (unsigned ByteIdx = 0; ByteIdx + ByteShAmt < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(BaseDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 0); + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitLShr(BinaryOperator &I) { + const ByteDefinition BaseDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!BaseDef) + return {}; + + const unsigned NumBytes = BaseDef.getLayout().getNumBytes(); + + const auto *Const = dyn_cast(I.getOperand(1)); + if (!Const) + return {}; + + if (isa(Const)) { + const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue(); + if (ShAmt % Byte::BitWidth != 0) + return {}; + const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytes); + + ByteVector BV; + BV.reserve(NumBytes); + for (unsigned ByteIdx = ByteShAmt; ByteIdx < NumBytes; ++ByteIdx) + BV.emplace_back(BaseDef.getByte(ByteIdx), 0); + + BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0)); + + assert(BV.size() == NumBytes); + return BV; + } + + assert(Const->getType()->isVectorTy()); + + ByteVector BV; + BV.reserve(NumBytes); + + const unsigned NumBytesPerElt = BaseDef.getLayout().NumBytesPerElement; + + for (unsigned EltIdx = 0; EltIdx < BaseDef.getLayout().NumVecElements; + ++EltIdx) { + const auto *ConstInt = + dyn_cast(Const->getAggregateElement(EltIdx)); + if (!ConstInt) + return {}; + const unsigned ShAmt = ConstInt->getValue().getLimitedValue(); + if (ShAmt % Byte::BitWidth != 0) + return {}; + const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytesPerElt); + for (unsigned ByteIdx = ByteShAmt; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(BaseDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 0); + + BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0)); + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitTruncInst(TruncInst &I) { + const std::optional Layout = tryGetByteLayout(I.getType()); + if (!Layout) + return {}; + + const std::optional SrcLayout = + tryGetByteLayout(I.getOperand(0)->getType()); + if (!SrcLayout) + return {}; + + const ByteDefinition SrcDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!SrcDef) + return {}; + + ByteVector BV; + const unsigned NumBytes = Layout->getNumBytes(); + BV.reserve(NumBytes); + + const 
unsigned NumBytesPerElt = Layout->NumBytesPerElement; + const unsigned NumSrcBytesPerElt = SrcLayout->NumBytesPerElement; + for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx) + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(SrcDef.getByte(EltIdx * NumSrcBytesPerElt + ByteIdx), 0); + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitZExtInst(ZExtInst &I) { + const std::optional Layout = tryGetByteLayout(I.getType()); + if (!Layout) + return {}; + + const ByteDefinition SrcDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!SrcDef) + return {}; + + ByteVector BV; + const unsigned NumBytes = Layout->getNumBytes(); + BV.reserve(NumBytes); + + const unsigned NumSrcBytesPerElt = SrcDef.getLayout().NumBytesPerElement; + const unsigned NumZExtBytesPerElt = + Layout->NumBytesPerElement - NumSrcBytesPerElt; + + unsigned SrcIdx = 0; + for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx) { + for (unsigned ByteIdx = 0; ByteIdx < NumSrcBytesPerElt; ++SrcIdx, ++ByteIdx) + BV.emplace_back(SrcDef.getByte(SrcIdx), 0); + + BV.append(NumZExtBytesPerElt, ByteUse(Byte::zeroes(), 0)); + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitBitCastInst(BitCastInst &I) { + const std::optional Layout = tryGetByteLayout(I.getType()); + if (!Layout) + return {}; + + const ByteDefinition SrcDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!SrcDef) + return {}; + + const unsigned NumBytes = Layout->getNumBytes(); + ByteVector BV; + BV.reserve(NumBytes); + for (unsigned Idx = 0; Idx < NumBytes; ++Idx) + BV.emplace_back(SrcDef.getByte(Idx), 0); + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitExtractElementInst(ExtractElementInst &I) { + const ByteDefinition VecDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!VecDef) + return {}; + + const auto *VecIdx = dyn_cast(I.getOperand(1)); + if (!VecIdx) + return {}; + + const unsigned NumBytes = VecDef.getLayout().NumBytesPerElement; + const unsigned ByteOffset = VecIdx->getLimitedValue() * NumBytes; + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) + BV.emplace_back(VecDef.getByte(ByteIdx + ByteOffset), 0); + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitInsertElementInst(InsertElementInst &I) { + const ByteDefinition VecDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!VecDef) + return {}; + + const ByteDefinition EltDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!EltDef) + return {}; + + const auto *VecIdx = dyn_cast(I.getOperand(2)); + if (!VecIdx) + return {}; + + const unsigned NumBytes = VecDef.getLayout().getNumBytes(); + const unsigned NumBytesPerElt = VecDef.getLayout().NumBytesPerElement; + + ByteVector BV; + BV.reserve(NumBytes); + for (unsigned EltIdx = 0; EltIdx < VecDef.getLayout().NumVecElements; + ++EltIdx) { + if (EltIdx == VecIdx->getLimitedValue()) { + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(EltDef.getByte(ByteIdx), 0); + } else { + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(VecDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 1); + } + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitShuffleVectorInst(ShuffleVectorInst &I) { + const std::optional Layout = 
tryGetByteLayout(I.getType()); + if (!Layout) + return {}; + + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const int LhsSize = LhsDef.getLayout().NumVecElements; + const unsigned NumBytes = Layout->getNumBytes(); + const unsigned NumBytesPerElt = Layout->NumBytesPerElement; + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx) { + const int Idx = I.getMaskValue(EltIdx); + if (Idx < 0) { + auto *Poison = PoisonValue::get(I.getType()->getElementType()); + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(Byte(*Poison, 0), ByteIdx); + } else if (Idx < LhsSize) { + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back( + LhsDef.getByte(static_cast(Idx) * NumBytesPerElt + + ByteIdx), + 0); + } else { + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(RhsDef.getByte(static_cast(Idx - LhsSize) * + NumBytesPerElt + + ByteIdx), + 1); + } + } + + assert(BV.size() == NumBytes); + return BV; +} + +void ByteExpander::updateFinalUsers(Value *V) { + assert(!isa(V)); + + // FIXME: Old users are copied because iterator is potentially invalidated by + // intermediacy checks. + DenseSet OldFinalUsers = getFinalUsers(V); + + DenseSet NewFinalUsers; + for (Value *User : OldFinalUsers) { + if (!Definitions.contains(User) || !checkIfIntermediate(User)) { + NewFinalUsers.insert(User); + continue; + } + const DenseSet &NestedUses = getFinalUsers(User); + NewFinalUsers.insert_range(NestedUses); + } + + FinalUsers[V] = std::move(NewFinalUsers); +} + +const DenseSet &ByteExpander::getFinalUsers(Value *V) { + assert(!isa(V)); + + auto It = FinalUsers.find(V); + + if (It != FinalUsers.end()) + return It->getSecond(); + + DenseSet &Uses = FinalUsers[V]; + for (Use &U : V->uses()) + Uses.insert(U.getUser()); + + return Uses; +} + +ByteVector *ByteExpander::expandByteDefinition(Value *V) { + auto It = Definitions.find(V); + if (It == Definitions.end()) + return nullptr; + return &It->getSecond(); +} + +ByteDefinition ByteExpander::getByteDefinition(Value *V, bool ExpandDef) { + const std::optional Layout = tryGetByteLayout(V->getType()); + if (!Layout) + return ByteDefinition::invalid(); + + if (auto *ConstInt = dyn_cast(V)) + return ByteDefinition::constInt(*ConstInt); + if (auto *Const = dyn_cast(V)) + if (Const->getType()->isVectorTy()) + return ByteDefinition::constVec(*Const); + + if (ExpandDef) + if (ByteVector *BV = expandByteDefinition(V)) + return ByteDefinition::vector(*BV, *Layout); + + return ByteDefinition::value(*V); +} + +ByteDefinition ByteExpander::getByteDefinitionIfIntermediateOperand(Value *V) { + return getByteDefinition(V, checkIfIntermediate(V, true)); +} + +bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) { + if (isa(V)) + return true; + + /// Short-circuit check. 
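+  /// An operand with exactly one use can only feed the instruction currently
+  /// being visited, so it is trivially intermediate.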
+ if (IsOperand && V->hasOneUse()) + return true; + + const DenseSet &FU = getFinalUsers(V); + if (FU.size() != 1) + return false; + + return Definitions.contains(*FU.begin()); +} + +std::vector ByteExpander::collectPIICandidates(Function &F) { + std::vector PackedIntInsts; + LLVM_DEBUG(dbgs() << "PICP: Entering function " << F.getName() << "\n"); + + unsigned NumIterations = 1; + for (;;) { + LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << "\n"); + bool Converged = true; + + std::vector CollectedInsts; + SetVector WorkList; + + for (BasicBlock *BB : ReversePostOrderTraversal(&F)) { + for (Instruction &I : *BB) { + ByteVector BV = visit(I); + if (BV.empty()) + continue; + + CollectedInsts.push_back(&I); + + ByteVector &Def = Definitions[&I]; + if (Def == BV) + continue; + + Converged = false; + Def = std::move(BV); + for (ByteUse &BU : Def) { + const Byte &B = BU.getByte(); + if (!B.isConstant() && !isa(B.getBase())) + WorkList.insert(B.getBase()); + } + + WorkList.insert(&I); + + LLVM_DEBUG({ + dbgs() << "PICP: Updating definition: "; + I.printAsOperand(dbgs()); + dbgs() << " = " << getByteDefinition(&I) << "\n"; + }); + } + } + + PackedIntInsts.swap(CollectedInsts); + + if (Converged) { + LLVM_DEBUG(dbgs() << "PICP: Reached fixpoint\n"); + break; + } + if (NumIterations == MaxCollectionIterations) { + LLVM_DEBUG(dbgs() << "PICP: Reached maximum iteration limit\n"); + break; + } + + // Update final uses of values before their operands. + for (auto RI = WorkList.rbegin(); RI != WorkList.rend(); ++RI) + updateFinalUsers(*RI); + ++NumIterations; + } + + LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << "\n"); + return PackedIntInsts; +} + +/// Return the value of all bits in a range, or std::nullopt if the bits vary. +static std::optional checkAllBits(const SmallBitVector &Mask, unsigned Lo, + unsigned NumBits) { + bool Bit = Mask[Lo]; + for (unsigned Idx = 1; Idx < NumBits; ++Idx) + if (Mask[Lo + Idx] != Bit) + return std::nullopt; + return Bit; +} + +/// Structure for tracking the set of bytes of a final value which are produced +/// by a given byte pack. +struct PartialBytePack { + + /// The value which produces the subset of bytes of a final value. + /// The byte pack is invalid if this pointer is null. + Value *BytePack; + /// A mask which identifies which bytes of a final value are provided by the + /// given byte pack. If a mask bit is not set, then the corresponding byte of + /// the byte pack must be zero. + SmallBitVector SetBytes; + + PartialBytePack(Value *BytePack, SmallBitVector SetBytes) + : BytePack(BytePack), SetBytes(SetBytes) {} + PartialBytePack(Value *BytePack, unsigned NumBytes) + : BytePack(BytePack), SetBytes(NumBytes) {} + + static PartialBytePack invalid() { return {nullptr, {}}; } + + bool isValid() const { return BytePack != nullptr; } +}; + +/// Construct an integer whose bytes are set depending on the value of the +/// corresponding \p Mask bit. A bit of \p Mask corresponds to an entire byte of +/// the resulting APInt. +static APInt createMaskConstant(unsigned BitWidth, const SmallBitVector &Mask) { + APInt BitMaskInt(BitWidth, 0); + for (unsigned ByteIdx : Mask.set_bits()) { + const unsigned BitIdx = ByteIdx * Byte::BitWidth; + if (BitIdx >= BitWidth) + break; + BitMaskInt.setBits(BitIdx, BitIdx + Byte::BitWidth); + } + return BitMaskInt; +} + +/// Construct a mask whose bits correspond to vector elements identified by the +/// \p ByteMask, or an empty vector if the \p Bytemask does not identify whole +/// vector elements. 
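+/// For example (illustrative), with NumBytesPerElement == 4, a ByteMask of
+/// size 8 with only its low four bits set maps to the element mask {1, 0},
+/// while a mask covering only part of an element yields an empty result.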
+static SmallBitVector getVectorElementMask(SmallBitVector &ByteMask, + unsigned NumBytesPerElement) { + if (ByteMask.size() % NumBytesPerElement != 0) + return {}; + const unsigned NumElts = ByteMask.size() / NumBytesPerElement; + + SmallBitVector EltMask; + EltMask.reserve(NumElts); + for (unsigned EltIdx = 0; EltIdx < NumElts; ++EltIdx) { + const std::optional Bits = + checkAllBits(ByteMask, EltIdx * NumBytesPerElement, NumBytesPerElement); + if (!Bits) + return {}; + EltMask.push_back(*Bits); + } + + assert(EltMask.size() == NumElts); + return EltMask; +} + +/// A key for the CastCache of the BytePackFolder. +struct CastEntry { + /// The value being casted. + Value *Base; + /// The type being casted into. + Type *CastTy; + /// The opcode of the cast instruction. + Instruction::CastOps OpCode; + + struct MapInfo { + static CastEntry getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey()}; + } + static CastEntry getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey()}; + } + static unsigned getHashValue(const CastEntry &E) { + return hash_combine( + DenseMapInfo::getHashValue(E.Base), + DenseMapInfo::getHashValue(E.CastTy), + DenseMapInfo::getHashValue(E.OpCode)); + } + static bool isEqual(const CastEntry &Lhs, const CastEntry &Rhs) { + return Lhs.Base == Rhs.Base && Lhs.CastTy == Rhs.CastTy && + Lhs.OpCode == Rhs.OpCode; + } + }; +}; + +/// The class responsible for taking coalesced bytes and folding them together +/// to produce the desired final value. +/// +/// When coalesced bytes are pushed, they are promoted to the target type, and +/// shifted to align the bytes to their corresponding offsets in the target +/// value. +class BytePackFolder { + /// The target final value to produce. + Instruction *TargetInst; + /// The layout of the target value. + ByteLayout Layout; + /// The collection of intermediate partial byte packs generated while folding + /// coalesced bytes. + std::vector WorkList; + /// The list of non-cast instructions generated while folding coalesced bytes. + SmallVector Insts; + /// A dedicated partial byte pack for collecting vector-aligned coalesced + /// bytes, if the target value is a vector type. + PartialBytePack VectorAlignedPack; + /// A cache holding all value casts needed, to avoid generating duplicate + /// casts. + MapVector> + CastCache; + + /// Create or reuse a cast of a given value. + Value *pushCast(Instruction::CastOps OpCode, Value *V, Type *DstTy) { + if (V->getType() == DstTy) + return V; + + CastEntry E{V, DstTy, OpCode}; + auto *It = CastCache.find(E); + if (It != CastCache.end()) + return It->second; + + auto *CI = CastInst::Create(OpCode, V, DstTy, V->getName() + ".cast"); + CastCache[E] = CI; + + LLVM_DEBUG({ + dbgs() << "PICP ["; + TargetInst->printAsOperand(dbgs()); + dbgs() << "]: Queuing cast " << *CI << "\n"; + }); + return CI; + } + + Instruction *pushInst(Instruction *I) { + // Cast instructions should be handled with pushCast. + assert(!isa(I)); + Insts.push_back(I); + + LLVM_DEBUG({ + dbgs() << "PICP ["; + TargetInst->printAsOperand(dbgs()); + dbgs() << "]: Queuing inst " << *I << "\n"; + }); + return I; + } + + /// Common functionality for promoting coalesced bytes to a vector. 
+ bool pushToVectorImpl(Value *V, SmallBitVector &ByteMask, unsigned NumSrcElts, + int ShrEltOffset, const Twine &Name) { + auto *TargetVecTy = cast(TargetInst->getType()); + auto *I32Ty = IntegerType::getInt32Ty(V->getContext()); + + // Try to push bytes to the vector-aligned builder. + SmallBitVector VecMask = + getVectorElementMask(ByteMask, Layout.NumBytesPerElement); + if (!VecMask.empty()) { + if (!VectorAlignedPack.isValid()) + VectorAlignedPack = PartialBytePack( + ConstantVector::getNullValue(TargetVecTy), Layout.getNumBytes()); + + if (NumSrcElts == 1) { + // Insert a single element + assert(ShrEltOffset <= 0); + VectorAlignedPack.BytePack = pushInst(InsertElementInst::Create( + VectorAlignedPack.BytePack, V, + ConstantInt::get(I32Ty, -ShrEltOffset), Name + ".insert")); + VectorAlignedPack.SetBytes |= ByteMask; + return true; + } + + assert(isa(V->getType())); + + if (NumSrcElts != Layout.NumVecElements) { + // We need to construct a vector of the same size as the vector-aligned + // byte pack before shuffling it in. + SmallVector ExtractMask; + ExtractMask.reserve(Layout.NumVecElements); + for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) { + if (VecMask.test(EltIdx)) { + const int SrcIdx = static_cast(EltIdx) + ShrEltOffset; + assert(SrcIdx >= 0); + ExtractMask.push_back(SrcIdx); + } else + ExtractMask.push_back(PoisonMaskElem); + } + assert(ExtractMask.size() == Layout.NumVecElements); + + V = pushInst(new ShuffleVectorInst(V, ExtractMask, Name + ".extract")); + // We have accounted for the shift already, so no need to account for it + // when shuffling into the vector-aligned byte pack. + ShrEltOffset = 0; + } + + assert(V->getType() == TargetVecTy); + + if (VecMask.all()) { + VectorAlignedPack.BytePack = V; + VectorAlignedPack.SetBytes.set(); + return true; + } + + SmallVector ShuffleMask; + ShuffleMask.reserve(Layout.NumVecElements); + for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) { + if (VecMask.test(EltIdx)) { + const int SrcIdx = static_cast(EltIdx) + ShrEltOffset; + assert(SrcIdx >= 0); + ShuffleMask.push_back(SrcIdx); + } else + ShuffleMask.push_back(EltIdx + Layout.NumVecElements); + } + assert(ShuffleMask.size() == Layout.NumVecElements); + + // We can shuffle directly into the vector-aligned byte pack. + VectorAlignedPack.BytePack = pushInst(new ShuffleVectorInst( + V, VectorAlignedPack.BytePack, ShuffleMask, Name + ".shuffle")); + VectorAlignedPack.SetBytes |= ByteMask; + return true; + } + + // Otherwise, just extract and mask the relevant elements, and append to the + // worklist. 
+ + if (NumSrcElts == 1) { + assert(ShrEltOffset <= 0); + V = pushInst(InsertElementInst::Create( + ConstantVector::getNullValue(TargetVecTy), V, + ConstantInt::get(I32Ty, -ShrEltOffset), Name + ".insert")); + } else if (NumSrcElts != Layout.NumVecElements) { + SmallVector ShuffleMask; + ShuffleMask.reserve(Layout.NumVecElements); + ShuffleMask.append(std::max(0, -ShrEltOffset), Layout.NumVecElements); + for (unsigned SrcIdx = std::max(0, ShrEltOffset); + SrcIdx < Layout.NumVecElements; ++SrcIdx) + ShuffleMask.push_back(SrcIdx); + ShuffleMask.append(std::max(0, ShrEltOffset), Layout.NumVecElements); + assert(ShuffleMask.size() == Layout.NumVecElements); + + V = pushInst( + new ShuffleVectorInst(V, ConstantVector::getNullValue(V->getType()), + ShuffleMask, Name + ".shuffle")); + } + + assert(V->getType() == TargetVecTy); + + const unsigned TargetBitWidth = Layout.getNumBytes() * Byte::BitWidth; + const unsigned TargetEltBitWidth = + Layout.NumBytesPerElement * Byte::BitWidth; + Type *TargetEltTy = TargetVecTy->getElementType(); + + APInt MaskBits = createMaskConstant(TargetBitWidth, ByteMask); + SmallVector EltwiseMask; + EltwiseMask.reserve(Layout.NumVecElements); + for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) + EltwiseMask.push_back(ConstantInt::get( + TargetEltTy, + MaskBits.extractBits(TargetEltBitWidth, EltIdx * TargetEltBitWidth))); + + V = pushInst(BinaryOperator::CreateAnd(V, ConstantVector::get(EltwiseMask), + Name + ".mask")); + + WorkList.emplace_back(V, ByteMask); + return true; + } + + bool pushIntegerToInteger(CoalescedBytes CB) { + assert(isa(CB.Base->getType())); + auto *TargetIntTy = cast(TargetInst->getType()); + + const unsigned NumTargetBytes = Layout.getNumBytes(); + Value *V = CB.Base; + const unsigned NumSrcBytes = getByteLayout(V->getType()).getNumBytes(); + const StringRef &Name = V->getName(); + + // Transformation: shr -> trunc -> mask -> zext -> shl + if (const unsigned ShrAmt = CB.getShrBits()) + V = pushInst(BinaryOperator::CreateLShr( + V, ConstantInt::get(V->getType(), ShrAmt), Name + ".shift")); + + if (NumSrcBytes > NumTargetBytes) + V = pushCast(Instruction::Trunc, V, TargetIntTy); + + const unsigned ShlByteOffset = CB.getShlBytes(); + const unsigned NumBytesToCheck = std::min( + ShlByteOffset < NumTargetBytes ? NumTargetBytes - ShlByteOffset : 0, + CB.getShrBytes() < NumSrcBytes ? NumSrcBytes - CB.getShrBytes() : 0); + if (!checkAllBits(CB.Mask, ShlByteOffset, NumBytesToCheck)) { + SmallBitVector RelMask = CB.Mask; + RelMask >>= ShlByteOffset; + Constant *Mask = ConstantInt::get( + V->getType(), + createMaskConstant(V->getType()->getIntegerBitWidth(), RelMask)); + V = pushInst(BinaryOperator::CreateAnd(V, Mask, Name + ".mask")); + } + + if (NumSrcBytes < NumTargetBytes) + V = pushCast(Instruction::ZExt, V, TargetIntTy); + + if (const unsigned ShlAmt = CB.getShlBits()) + V = pushInst(BinaryOperator::CreateShl( + V, ConstantInt::get(V->getType(), ShlAmt), Name + ".shift")); + + WorkList.emplace_back(V, CB.Mask); + return true; + } + + bool pushIntegerToVector(CoalescedBytes CB) { + assert(isa(CB.Base->getType())); + auto *TargetVecTy = cast(TargetInst->getType()); + Type *TargetEltTy = TargetVecTy->getElementType(); + + Value *V = CB.Base; + const unsigned NumSrcBytes = + V->getType()->getIntegerBitWidth() / Byte::BitWidth; + const StringRef &Name = V->getName(); + + // Give up if bytes are obtained from a strange offset. 
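+    // For example (illustrative), an i32 source contributing to a <2 x i16>
+    // target at a 1-byte offset would straddle an element boundary, so it is
+    // rejected here.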
+ if (CB.SignedShrByteOffset % Layout.NumBytesPerElement != 0) + return {}; + const int ShrEltOffset = + CB.SignedShrByteOffset / static_cast(Layout.NumBytesPerElement); + + // Give up if the source integer does not decompose naturally into vector + // elements. + if (NumSrcBytes % Layout.NumBytesPerElement != 0) + return {}; + const unsigned NumSrcElts = NumSrcBytes / Layout.NumBytesPerElement; + + if (NumSrcElts > 1) { + auto *CastTy = FixedVectorType::get(TargetEltTy, NumSrcElts); + V = pushCast(Instruction::BitCast, V, CastTy); + } + + return pushToVectorImpl(V, CB.Mask, NumSrcElts, ShrEltOffset, Name); + } + + bool pushVectorToInteger(CoalescedBytes CB) { + assert(isa(CB.Base->getType())); + auto *TargetIntTy = cast(TargetInst->getType()); + + const unsigned NumTargetBytes = Layout.getNumBytes(); + Value *V = CB.Base; + const StringRef &Name = V->getName(); + ByteLayout VecLayout = getByteLayout(V->getType()); + + // For sub-element accesses, try to subdivide the vector into smaller + // elements. + if (VecLayout.NumBytesPerElement > NumTargetBytes) { + if (VecLayout.NumBytesPerElement % NumTargetBytes != 0) + return {}; + + const unsigned SplitFactor = + VecLayout.NumBytesPerElement / NumTargetBytes; + auto *NewTy = FixedVectorType::get(TargetIntTy, VecLayout.NumVecElements * + SplitFactor); + V = pushCast(Instruction::BitCast, V, NewTy); + VecLayout = getByteLayout(V->getType()); + } + + // Give up if bytes are obtained from a strange offset. + if (CB.SignedShrByteOffset % VecLayout.NumBytesPerElement != 0) + return {}; + + int ShrEltOffset = + CB.SignedShrByteOffset / static_cast(VecLayout.NumBytesPerElement); + + // Give up if the target integer does not decompose naturally into vector + // elements. + if (NumTargetBytes % VecLayout.NumBytesPerElement != 0) + return {}; + const unsigned NumTargetElts = + NumTargetBytes / VecLayout.NumBytesPerElement; + + auto *I32Ty = IntegerType::getInt32Ty(V->getContext()); + + // Coarsely isolate elements of interest, and use a bitmask to clean up the + // rest. 
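+    // For example (illustrative), with an i32 target and a <4 x i16> source,
+    // the two relevant i16 elements are shuffled out and bitcast to i32, and
+    // a bitmask is only emitted afterwards if the wanted bytes do not cover
+    // whole source elements.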
+ const bool NeedsBitMask = [&] { + if (NumTargetElts == 1) { + // Extract the unique relevant element + const int ExtractIdx = ShrEltOffset; + assert(ExtractIdx >= 0); + V = pushInst(ExtractElementInst::Create( + V, ConstantInt::get(I32Ty, ExtractIdx), Name + ".extract")); + ShrEltOffset = 0; + return !CB.Mask.all(); + } + + if (NumTargetElts != VecLayout.NumVecElements) { + bool IsVectorAligned = true; + + // Extract all relevant elements into a shufflevector + SmallVector ShuffleMask; + ShuffleMask.reserve(NumTargetElts); + + for (unsigned EltIdx = 0; EltIdx < NumTargetElts; ++EltIdx) { + const std::optional EltMask = + checkAllBits(CB.Mask, EltIdx * VecLayout.NumBytesPerElement, + VecLayout.NumBytesPerElement); + + IsVectorAligned &= EltMask.has_value(); + if (!EltMask || *EltMask) { + const int ExtractIdx = static_cast(EltIdx) + ShrEltOffset; + assert(ExtractIdx >= 0); + ShuffleMask.push_back(ExtractIdx); + } else { + ShuffleMask.push_back(VecLayout.NumVecElements); + } + } + + V = pushInst( + new ShuffleVectorInst(V, ConstantVector::getNullValue(V->getType()), + ShuffleMask, Name + ".shuffle")); + V = pushCast(Instruction::BitCast, V, TargetIntTy); + + ShrEltOffset = 0; + return !IsVectorAligned; + } + + V = pushCast(Instruction::BitCast, V, TargetIntTy); + return !CB.Mask.all(); + }(); + + assert(V->getType() == TargetIntTy); + + const int ShrBitOffset = ShrEltOffset * + static_cast(VecLayout.NumBytesPerElement) * + static_cast(Byte::BitWidth); + if (ShrBitOffset > 0) + V = pushInst(BinaryOperator::CreateLShr( + V, ConstantInt::get(V->getType(), ShrBitOffset), Name + ".shift")); + else if (ShrBitOffset < 0) + V = pushInst(BinaryOperator::CreateShl( + V, ConstantInt::get(V->getType(), -ShrBitOffset), Name + ".shift")); + + if (NeedsBitMask) { + // Mask out unwanted bytes. + Constant *Mask = ConstantInt::get( + TargetIntTy, createMaskConstant(TargetIntTy->getBitWidth(), CB.Mask)); + V = pushInst(BinaryOperator::CreateAnd(V, Mask, Name + ".mask")); + } + + WorkList.emplace_back(V, CB.Mask); + return true; + } + + bool pushVectorToVector(CoalescedBytes CB) { + assert(isa(CB.Base->getType())); + auto *TargetVecTy = cast(TargetInst->getType()); + Type *TargetEltTy = TargetVecTy->getElementType(); + + const ByteLayout SrcLayout = getByteLayout(CB.Base->getType()); + Value *V = CB.Base; + const StringRef &Name = V->getName(); + + // Give up if the source vector cannot be converted to match the elements of + // the target vector. + if (SrcLayout.getNumBytes() % Layout.NumBytesPerElement != 0) + return {}; + const unsigned NumSrcElts = + SrcLayout.getNumBytes() / Layout.NumBytesPerElement; + + // Give up if the shift amount is not aligned to the target vector. 
+    if (CB.SignedShrByteOffset % Layout.NumBytesPerElement != 0)
+      return {};
+
+    const int ShrEltOffset =
+        CB.SignedShrByteOffset / static_cast<int>(Layout.NumBytesPerElement);
+
+    Type *SrcTy;
+    if (NumSrcElts > 1)
+      SrcTy = FixedVectorType::get(TargetEltTy, NumSrcElts);
+    else
+      SrcTy = TargetEltTy;
+
+    V = pushCast(Instruction::BitCast, V, SrcTy);
+
+    return pushToVectorImpl(V, CB.Mask, NumSrcElts, ShrEltOffset, Name);
+  }
+
+  PartialBytePack mergeIntegerPacks(PartialBytePack &Lhs,
+                                    PartialBytePack &Rhs) {
+    assert(isa<IntegerType>(Lhs.BytePack->getType()) &&
+           isa<IntegerType>(Rhs.BytePack->getType()));
+    Value *Merge = pushInst(BinaryOperator::CreateDisjointOr(
+        Lhs.BytePack, Rhs.BytePack, TargetInst->getName() + ".merge", nullptr));
+    return {Merge, Lhs.SetBytes | Rhs.SetBytes};
+  }
+
+  PartialBytePack mergeVectorPacks(PartialBytePack &Lhs, PartialBytePack &Rhs) {
+    assert(isa<FixedVectorType>(Lhs.BytePack->getType()) &&
+           isa<FixedVectorType>(Rhs.BytePack->getType()));
+    SmallVector<int> ShuffleMask;
+    ShuffleMask.reserve(Layout.NumVecElements);
+    for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) {
+      const unsigned Lo = EltIdx * Layout.NumBytesPerElement;
+      const std::optional<bool> LhsBits =
+          checkAllBits(Lhs.SetBytes, Lo, Layout.NumBytesPerElement);
+      if (!LhsBits) {
+        const std::optional<bool> RhsBits =
+            checkAllBits(Rhs.SetBytes, Lo, Layout.NumBytesPerElement);
+        if (!RhsBits) {
+          ShuffleMask.clear();
+          break;
+        }
+        ShuffleMask.push_back(*RhsBits ? EltIdx + Layout.NumVecElements
+                                       : EltIdx);
+        continue;
+      }
+
+      ShuffleMask.push_back(*LhsBits ? EltIdx : EltIdx + Layout.NumVecElements);
+    }
+
+    const std::string Name = (TargetInst->getName() + ".merge").str();
+    Value *Merge;
+    if (ShuffleMask.empty())
+      Merge = pushInst(BinaryOperator::CreateDisjointOr(
+          Lhs.BytePack, Rhs.BytePack, Name, nullptr));
+    else
+      Merge = pushInst(
+          new ShuffleVectorInst(Lhs.BytePack, Rhs.BytePack, ShuffleMask, Name));
+
+    return {Merge, Lhs.SetBytes | Rhs.SetBytes};
+  }
+
+public:
+  BytePackFolder(Instruction *TargetV)
+      : TargetInst(TargetV), Layout(getByteLayout(TargetV->getType())),
+        VectorAlignedPack(PartialBytePack::invalid()) {}
+
+  ~BytePackFolder() {
+    // If instructions are not committed, they need to be cleaned up.
+    for (auto &[_, I] : CastCache) {
+      LLVM_DEBUG({
+        dbgs() << "PICP [";
+        TargetInst->printAsOperand(dbgs());
+        dbgs() << "]: Dequeuing cast " << *I << "\n";
+      });
+      I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+      I->deleteValue();
+    }
+
+    while (!Insts.empty()) {
+      LLVM_DEBUG({
+        dbgs() << "PICP [";
+        TargetInst->printAsOperand(dbgs());
+        dbgs() << "]: Dequeuing inst " << *Insts.back() << "\n";
+      });
+      Insts.back()->deleteValue();
+      Insts.pop_back();
+    }
+  }
+
+  /// Try to generate instructions for coalescing the given bytes and aligning
+  /// them to the target value. Returns true iff this is successful.
+  bool pushCoalescedBytes(CoalescedBytes CB) {
+    if (isa<Constant>(CB.Base) && CB.SignedShrByteOffset == 0) {
+      WorkList.emplace_back(CB.Base, CB.Mask);
+      return true;
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "PICP [";
+      TargetInst->printAsOperand(dbgs());
+      dbgs() << "]: Preparing bytes " << CB << "\n";
+    });
+    if (isa<FixedVectorType>(TargetInst->getType())) {
+      if (isa<FixedVectorType>(CB.Base->getType()))
+        return pushVectorToVector(CB);
+
+      return pushIntegerToVector(CB);
+    }
+
+    if (isa<FixedVectorType>(CB.Base->getType()))
+      return pushVectorToInteger(CB);
+
+    return pushIntegerToInteger(CB);
+  }
+
+  /// After coalescing all byte packs individually, this folds the coalesced
+  /// byte packs together to (re)produce the final value and return it.
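+  ///
+  /// Illustrative sketch only (assumed value names): two partial i32 packs
+  /// covering bytes {0,1} and {2,3} respectively are merged with a single
+  /// disjoint or,
+  ///   %x.merge = or disjoint i32 %x.lo, %x.hi
+  /// whereas vector packs whose set bytes land on distinct whole elements are
+  /// merged with one shufflevector instead.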
+  Value *foldBytePacks(IRBuilder<> &IRB) {
+    Type *TargetTy = TargetInst->getType();
+
+    if (VectorAlignedPack.isValid()) {
+      WorkList.push_back(VectorAlignedPack);
+      VectorAlignedPack = PartialBytePack::invalid();
+    }
+
+    while (WorkList.size() > 1) {
+      std::vector<PartialBytePack> NewWorkList;
+      NewWorkList.reserve((WorkList.size() + 1) / 2);
+
+      for (unsigned Item = 0; Item + 1 < WorkList.size(); Item += 2) {
+        PartialBytePack &Lhs = WorkList[Item];
+        PartialBytePack &Rhs = WorkList[Item + 1];
+        NewWorkList.push_back(isa<FixedVectorType>(TargetTy)
+                                  ? mergeVectorPacks(Lhs, Rhs)
+                                  : mergeIntegerPacks(Lhs, Rhs));
+      }
+      if (WorkList.size() % 2 == 1)
+        NewWorkList.push_back(WorkList.back());
+
+      WorkList.swap(NewWorkList);
+    }
+
+    IRB.SetInsertPoint(TargetInst);
+    for (Value *I : Insts)
+      IRB.Insert(I, I->getName());
+
+    Insts.clear();
+
+    for (auto &[E, I] : CastCache) {
+      if (auto *BaseI = dyn_cast<Instruction>(E.Base))
+        I->insertInto(BaseI->getParent(), *BaseI->getInsertionPointAfterDef());
+      else {
+        BasicBlock &BB = TargetInst->getFunction()->getEntryBlock();
+        I->insertInto(&BB, BB.getFirstInsertionPt());
+      }
+    }
+
+    CastCache.clear();
+
+    // Note: WorkList may be empty if the value is known to be zero.
+    return WorkList.empty() ? Constant::getNullValue(TargetTy)
+                            : WorkList.back().BytePack;
+  }
+};
+
+/// A final value (or an operand thereof, if the rewriter is not aggressive)
+/// queued up to be reconstructed.
+struct PackedIntInstruction {
+  /// The target value to reconstruct.
+  Instruction *TargetInst;
+  /// The chosen partitioning of its bytes.
+  SmallVector<CoalescedBytes> CBV;
+
+  PackedIntInstruction(Instruction &I, SmallVector<CoalescedBytes> &&CBV)
+      : TargetInst(&I), CBV(CBV) {}
+
+  /// Try to reconstruct a value given its coalesced byte partitioning,
+  /// returning the reconstructed value on success, or a nullptr on failure.
+  Value *rewrite(IRBuilder<> &IRB) const {
+    BytePackFolder BPF(TargetInst);
+    for (const CoalescedBytes &CB : CBV) {
+      if (!BPF.pushCoalescedBytes(CB)) {
+        LLVM_DEBUG(dbgs() << "PICP: Coalescing rejected!\n");
+        return nullptr;
+      }
+    }
+
+    return BPF.foldBytePacks(IRB);
+  }
+};
+
+/// Coalesce the bytes in a definition into a partition for rewriting.
+/// If the rewriter is non-aggressive, return nullopt if the rewriting is
+/// determined to be unnecessary.
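+///
+/// Illustrative sketch only (assumed value names): given
+///   %lo = and i32 %a, 65535
+///   %hi = shl i32 %b, 16
+///   %r  = or i32 %lo, %hi
+/// the byte definition of %r coalesces into two entries, bytes 0-1 sourced
+/// from %a unshifted and bytes 2-3 sourced from the low bytes of %b; a
+/// constant entry is appended only for bytes known to be nonzero constants.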
+static std::optional<SmallVector<CoalescedBytes>>
+getCoalescingOpportunity(Type *Ty, const ByteVector &BV) {
+  const ByteLayout Layout = getByteLayout(Ty);
+  assert(Layout.getNumBytes() == BV.size() &&
+         "Byte definition has unexpected width.");
+
+  SmallVector<CoalescedBytes> CBV;
+  SmallVector<int> CBVOperands;
+  const unsigned BitWidth = Layout.getNumBytes() * Byte::BitWidth;
+  APInt ConstBits(BitWidth, 0);
+  SmallBitVector ConstBytes(BV.size());
+
+  bool OperandsAlreadyCoalesced = true;
+  bool UsesSingleSource = true;
+  for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx) {
+    const ByteUse &BU = BV[ByteIdx];
+    const Byte &B = BU.getByte();
+    if (B.isConstant()) {
+      const unsigned Const = B.getConstant();
+      if (!Const)
+        continue;
+
+      ConstBits.insertBits(Const, ByteIdx * Byte::BitWidth, Byte::BitWidth);
+      ConstBytes.set(ByteIdx);
+    } else {
+      CoalescedBytes *CB = nullptr;
+      Value *Base = B.getBase();
+      const signed Offset =
+          static_cast<int>(B.getIndex()) - static_cast<int>(ByteIdx);
+      for (unsigned CBIdx = 0; CBIdx < CBV.size(); ++CBIdx) {
+        if (CBV[CBIdx].alignsWith(Base, Offset)) {
+          CB = &CBV[CBIdx];
+          int &OpIdx = CBVOperands[CBIdx];
+          if (OpIdx < 0)
+            OpIdx = BU.getOperandIndex();
+          else if (BU.getOperandIndex() >= 0 && OpIdx != BU.getOperandIndex()) {
+            LLVM_DEBUG(dbgs()
+                       << "PICP: Bytes " << *CB << " from operand " << OpIdx
+                       << " can be coalesced with byte " << B
+                       << " from operand " << BU.getOperandIndex() << "\n");
+            OperandsAlreadyCoalesced = false;
+          }
+        }
+      }
+
+      if (!CB) {
+        CB = &CBV.emplace_back(*Base, Offset, BV.size());
+        CBVOperands.push_back(BU.getOperandIndex());
+      }
+
+      UsesSingleSource &= CB->Base == CBV.front().Base;
+      CB->Mask.set(ByteIdx);
+    }
+  }
+
+  if (!AggressiveRewriting) {
+    if (OperandsAlreadyCoalesced && !CBV.empty()) {
+      // If packed bytes from the same source and offset are not split between
+      // operands, then this instruction does not need to be rewritten.
+      LLVM_DEBUG(dbgs() << "PICP: Operands are already coalesced.\n");
+      return std::nullopt;
+    }
+    if (UsesSingleSource && CBV.size() > 1) {
+      // If packed bytes come from the same source, but cannot be coalesced
+      // (e.g., bytes from one operand are shuffled), then rewriting this
+      // instruction may lead to strange IR.
+      LLVM_DEBUG(
+          dbgs()
+          << "PICP: Instruction rearranges bytes from a single source.\n");
+      return std::nullopt;
+    }
+  }
+
+  // The CBV will be used for rewriting; append the constant value that was
+  // also accumulated, if nonzero.
+  if (ConstBytes.any()) {
+    // Create initial constant as desired type.
+    if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+      SmallVector<Constant *> EltwiseMask;
+      const unsigned NumBitsPerElt = Layout.NumBytesPerElement * Byte::BitWidth;
+      EltwiseMask.reserve(Layout.NumVecElements);
+      for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx)
+        EltwiseMask.push_back(ConstantInt::get(
+            VecTy->getElementType(),
+            ConstBits.extractBits(NumBitsPerElt, EltIdx * NumBitsPerElt)));
+
+      CBV.emplace_back(*ConstantVector::get(EltwiseMask), 0, ConstBytes);
+    } else
+      CBV.emplace_back(*ConstantInt::get(Ty, ConstBits), 0, ConstBytes);
+  }
+
+  return CBV;
+}
+
+/// Queue into \p PIIV the set of final values (or operands thereof, if the
+/// rewriter is non-aggressive) which are deemed beneficial to rewrite.
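+///
+/// For instance (illustrative only, assumed value names), with non-aggressive
+/// rewriting a single-operand instruction such as
+///   %w = zext i16 %h to i32
+/// is never rewritten itself; the instruction defining %h is queued for
+/// coalescing instead.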
+static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
+                           Instruction &FinalInst, ByteExpander &BE) {
+  SmallVector<Instruction *> WorkList{&FinalInst};
+  SmallPtrSet<Instruction *, 8> Seen{&FinalInst};
+
+  do {
+    Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    const ByteVector *BV = BE.expandByteDefinition(I);
+    if (!BV)
+      // This instruction is beyond the analysis scope of PICP.
+      continue;
+
+    LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << "\n"
+                      << " byte pack: " << BE.getByteDefinition(I)
+                      << "\n");
+    auto CBV = [&]() -> std::optional<SmallVector<CoalescedBytes>> {
+      // Short-circuit check for casts.
+      if (!AggressiveRewriting && I->getNumOperands() == 1)
+        return std::nullopt;
+
+      return getCoalescingOpportunity(I->getType(), *BV);
+    }();
+
+    if (!CBV) {
+      // Narrow rewriting to the operands of this instruction instead.
+      for (Use &U : I->operands())
+        if (auto *Op = dyn_cast<Instruction>(U.get()))
+          if (Seen.insert(Op).second)
+            WorkList.push_back(Op);
+      continue;
+    }
+
+    PIIV.emplace_back(*I, std::move(*CBV));
+  } while (!WorkList.empty());
+}
+
+static bool runImpl(Function &F) {
+  ByteExpander BE;
+
+  std::vector<Instruction *> PIICandidates = BE.collectPIICandidates(F);
+  std::vector<PackedIntInstruction> PIIV;
+
+  for (Instruction *I : PIICandidates) {
+    if (!BE.checkIfIntermediate(I))
+      queueRewriting(PIIV, *I, BE);
+    else
+      LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << "\n"
+                        << " final user: "
+                        << **BE.getFinalUsers(I).begin() << "\n");
+  }
+
+  DenseMap<Instruction *, Value *> InstSubs;
+  IRBuilder<> IRB(F.getContext());
+  for (const PackedIntInstruction &PII : PIIV)
+    if (Value *V = PII.rewrite(IRB)) {
+      LLVM_DEBUG(dbgs() << "PICP rewrite successful for " << *PII.TargetInst
+                        << "\n");
+      InstSubs[PII.TargetInst] = V;
+    }
+
+  if (InstSubs.empty())
+    return false;
+
+  for (auto &[OldI, NewV] : InstSubs)
+    OldI->replaceAllUsesWith(NewV);
+
+  for (auto RIt = PIICandidates.rbegin(); RIt != PIICandidates.rend(); ++RIt) {
+    Instruction *I = *RIt;
+    if (I->getNumUses() == 0)
+      I->eraseFromParent();
+  }
+  return true;
+}
+
+class PackedIntegerCombineLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  PackedIntegerCombineLegacyPass() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override { return runImpl(F); }
+};
+char PackedIntegerCombineLegacyPass::ID = 0;
+
+} // namespace
+
+PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  if (!runImpl(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+INITIALIZE_PASS(PackedIntegerCombineLegacyPass, DEBUG_TYPE,
+                "Packed Integer Combine", false, false)
+
+FunctionPass *llvm::createPackedIntegerCombinePass() {
+  return new PackedIntegerCombineLegacyPass();
+}
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index c7e4a3e824700..2c2b8964af8d5 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -36,6 +36,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLowerAtomicLegacyPassPass(Registry);
   initializeMergeICmpsLegacyPassPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
+  initializePackedIntegerCombineLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
   initializeReassociateLegacyPassPass(Registry);
   initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index 93b5f155fc81e..2cd541363d44d 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++
b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -205,12 +205,12 @@ define i64 @load_3xi16_combine(ptr addrspace(1) %p) #0 { ; GCN-LABEL: load_3xi16_combine: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: global_load_ushort v3, v[0:1], off offset:4 +; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1 %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index af3241e95e91d..801712ba90988 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -541,6 +541,7 @@ ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis ; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer +; GCN-O1-OPTS-NEXT: Packed Integer Combine ; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches ; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators @@ -856,6 +857,7 @@ ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: GPU Load and Store Vectorizer +; GCN-O2-NEXT: Packed Integer Combine ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches ; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators @@ -1186,6 +1188,7 @@ ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: GPU Load and Store Vectorizer +; GCN-O3-NEXT: Packed Integer Combine ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower SwitchInst's to branches ; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 38e45042b5ee4..d13f5d569aae6 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1867,27 +1867,9 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v3, s3 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s3 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_readfirstlane_b32 s0, v1 -; NOSDWA-NEXT: v_readfirstlane_b32 s1, v0 -; NOSDWA-NEXT: s_lshr_b32 s3, s1, 24 -; NOSDWA-NEXT: s_lshr_b32 s5, s0, 24 -; NOSDWA-NEXT: s_and_b32 s2, s1, 0xffff -; NOSDWA-NEXT: s_bfe_u32 s1, s1, 0x80010 -; NOSDWA-NEXT: s_and_b32 s4, s0, 0xffff -; NOSDWA-NEXT: s_bfe_u32 s0, s0, 0x80010 -; NOSDWA-NEXT: s_lshl_b32 s3, s3, 8 -; NOSDWA-NEXT: s_lshl_b32 s5, s5, 8 -; NOSDWA-NEXT: s_or_b32 s1, s1, s3 -; NOSDWA-NEXT: s_or_b32 s0, s0, s5 -; NOSDWA-NEXT: s_lshl_b32 s1, s1, 16 -; NOSDWA-NEXT: s_lshl_b32 s0, s0, 16 -; NOSDWA-NEXT: s_or_b32 s1, s2, s1 -; NOSDWA-NEXT: s_or_b32 s0, s4, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v0, s1 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s0 ; NOSDWA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; NOSDWA-NEXT: s_endpgm ; @@ -1898,85 +1880,21 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; GFX89-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX89-NEXT: v_mov_b32_e32 v3, s3 ; GFX89-NEXT: v_mov_b32_e32 v2, s2 +; GFX89-NEXT: v_mov_b32_e32 v3, s3 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_readfirstlane_b32 s0, v1 -; GFX89-NEXT: v_readfirstlane_b32 s1, v0 -; GFX89-NEXT: s_lshr_b32 s3, s1, 24 -; GFX89-NEXT: s_lshr_b32 s5, s0, 24 -; GFX89-NEXT: s_and_b32 s2, s1, 0xffff -; GFX89-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX89-NEXT: s_and_b32 s4, s0, 0xffff -; GFX89-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX89-NEXT: s_lshl_b32 s3, s3, 8 -; GFX89-NEXT: s_lshl_b32 s5, s5, 8 -; GFX89-NEXT: s_or_b32 s1, s1, s3 -; GFX89-NEXT: s_or_b32 s0, s0, s5 -; GFX89-NEXT: s_lshl_b32 s1, s1, 16 -; GFX89-NEXT: s_lshl_b32 s0, s0, 16 -; GFX89-NEXT: s_or_b32 s1, s2, s1 -; GFX89-NEXT: s_or_b32 s0, s4, s0 -; GFX89-NEXT: v_mov_b32_e32 v0, s1 -; GFX89-NEXT: v_mov_b32_e32 v1, s0 ; GFX89-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX89-NEXT: s_endpgm ; -; GFX9-LABEL: pulled_out_test: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_lshr_b32 s5, s1, 24 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: s_and_b32 s4, s1, 0xffff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX9-NEXT: s_and_b32 s6, s0, 0xffff -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_or_b32 s1, s1, s5 -; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s1, s4, s1 -; GFX9-NEXT: s_or_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: pulled_out_test: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_and_b32 s4, s0, 0xffff -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_and_b32 s6, s1, 0xffff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_or_b32 s0, s0, s5 -; GFX10-NEXT: s_or_b32 s1, s1, s7 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_or_b32 s1, s6, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] -; GFX10-NEXT: s_endpgm +; GFX9_10-LABEL: pulled_out_test: +; GFX9_10: ; %bb.0: ; %entry +; GFX9_10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9_10-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_10-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9_10-NEXT: s_waitcnt vmcnt(0) +; GFX9_10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9_10-NEXT: s_endpgm entry: %idxprom = ashr exact i64 15, 32 %arrayidx = getelementptr inbounds <8 x i8>, ptr addrspace(1) %sourceA, i64 %idxprom @@ -2297,5 +2215,4 @@ declare i32 
@llvm.amdgcn.workitem.id.x() attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} -; GFX9_10: {{.*}} ; SDWA: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 5aafb0f576fb4..fc990283b004c 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: KILL undef %125:sgpr_128 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %119:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: KILL undef %119:sgpr_128 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc @@ -44,37 +44,38 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) - ; CHECK-NEXT: KILL undef %74:sreg_64 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.78, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %68:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 + ; CHECK-NEXT: KILL undef %68:sreg_64 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
(s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %83:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL undef %89:sgpr_128 - ; CHECK-NEXT: KILL undef %118:sgpr_128 + ; CHECK-NEXT: KILL undef %83:sgpr_128 + ; CHECK-NEXT: KILL undef %112:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.84, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.90, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 + ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %163:sreg_32, 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %163:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.96, addrspace 4) ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef 
[[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %163:sreg_32, implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc @@ -87,20 +88,21 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %296:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, undef %352:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %362:sgpr_128, 16, 0 :: 
(dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.104, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %346:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc @@ -114,19 +116,19 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant 
load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) + ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %378:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.131, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.142, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.147, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.159, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.137, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.154, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc @@ -138,50 +140,47 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN 
[[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.270, align 8, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET 
[[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.279, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4) - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 - ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub3 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM1]], 0, 0 :: (dereferenceable invariant load 
(s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) @@ -191,33 +190,32 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4) - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 - ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.287, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub2:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub3:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub3 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM2]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.253, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %467:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: KILL undef %467:sreg_64 + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX2_IMM2]].sub0_sub1_sub2, [[S_LOAD_DWORDX2_IMM2]].sub3 ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %470:sreg_64 - ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc - ; 
CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.262, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.296, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] - ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc @@ -226,20 +224,20 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = 
S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.314, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.320, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.326, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] - ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] + ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -267,9 +265,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec ; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec - ; CHECK-NEXT: 
[[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_14]], [[V_SUBREV_U32_e64_9]], implicit $exec ; CHECK-NEXT: [[V_SUBREV_U32_e64_11:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 52, [[BUFFER_LOAD_FORMAT_X_IDXEN10]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_16:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_15]], [[V_SUBREV_U32_e64_10]], implicit $exec @@ -351,13 +349,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %540:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %554:vgpr_32, undef %556:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index c554fdbf4c799..9aba7e030ec19 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -207,6 +207,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 62bb02d9b3c40..3ed2950fa9ce2 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -133,6 +133,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: 
Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 0da7a9f73bdce..bc04d1b35bb82 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -118,6 +118,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 38b7890682783..772b3cc4a5fb6 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -127,6 +127,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index 5aacd26def2be..84d62d56c3a9b 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -165,6 +165,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index f6a9406596803..6e7b973213494 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -167,6 +167,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index 48a9433d24999..10e3f7c6e1fb6 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -131,6 +131,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff 
--git a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll new file mode 100644 index 0000000000000..e7b6a6bc66fa1 --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll @@ -0,0 +1,601 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @add.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @add.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[ADD]] +; +; AGGRESSIVE-LABEL: define i32 @add.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936 +; AGGRESSIVE-NEXT: [[ADD_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[ADD_MERGE]] +; + %a.mask = and i32 %a, u0xff00ff00 + %b.mask = and i32 %b, u0x00ff00ff + %add = add i32 %a.mask, %b.mask + ret i32 %add +} + +;; u0xff00ffff = -16711681 +;; u0x00ff00ff = 16711935 +;; Nothing happens in this case because of the overlapping bytes. +define i32 @add.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @add.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[ADD]] +; +; AGGRESSIVE-LABEL: define i32 @add.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681 +; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[ADD:%.*]] = add i32 [[A_MASK2]], [[B_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[ADD]] +; + %a.mask = and i32 %a, u0xff00ffff + %b.mask = and i32 %b, u0x00ff00ff + %add = add i32 %a.mask, %b.mask + ret i32 %add +} + +define i32 @and.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @and.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: ret i32 0 +; +; AGGRESSIVE-LABEL: define i32 @and.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: ret i32 0 +; + %a.mask = and i32 %a, u0xff00ff00 + %b.mask = and i32 %b, u0x00ff00ff + %and = and i32 %a.mask, %b.mask + ret i32 %and +} + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @and.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @and.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[AND:%.*]] = and i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[AND]] +; +; AGGRESSIVE-LABEL: define i32 @and.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681 +; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[AND:%.*]] = and i32 [[A_MASK2]], [[B_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[AND]] +; + %a.mask = and i32 %a, u0xff00ffff + %b.mask = and i32 %b, u0x00ff00ff + %and = and i32 %a.mask, %b.mask + ret i32 %and +} + +define i32 @and.2(i32 %x) { 
+; LAZY-LABEL: define i32 @and.2( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_0:%.*]] = and i32 [[X]], -16711681 +; LAZY-NEXT: [[X_1:%.*]] = and i32 [[X]], 16711935 +; LAZY-NEXT: [[AND:%.*]] = and i32 [[X_0]], [[X_1]] +; LAZY-NEXT: ret i32 [[AND]] +; +; AGGRESSIVE-LABEL: define i32 @and.2( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], 255 +; AGGRESSIVE-NEXT: ret i32 [[X_MASK]] +; + %x.0 = and i32 %x, u0xff00ffff + %x.1 = and i32 %x, u0x00ff00ff + %and = and i32 %x.0, %x.1 + ret i32 %and +} + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @or.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @or.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[OR:%.*]] = or i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[OR]] +; +; AGGRESSIVE-LABEL: define i32 @or.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936 +; AGGRESSIVE-NEXT: [[OR_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[OR_MERGE]] +; + %a.mask = and i32 %a, u0xff00ff00 + %b.mask = and i32 %b, u0x00ff00ff + %or = or i32 %a.mask, %b.mask + ret i32 %or +} + +;; u0xff00ffff = -16711681 +;; u0x00ff00ff = 16711935 +;; Nothing happens in this case because of the overlapping bytes. +define i32 @or.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @or.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[OR:%.*]] = or i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[OR]] +; +; AGGRESSIVE-LABEL: define i32 @or.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681 +; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[OR:%.*]] = or i32 [[A_MASK2]], [[B_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[OR]] +; + %a.mask = and i32 %a, u0xff00ffff + %b.mask = and i32 %b, u0x00ff00ff + %or = or i32 %a.mask, %b.mask + ret i32 %or +} + +define i32 @or.2(i32 %x) { +; LAZY-LABEL: define i32 @or.2( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: ret i32 [[X]] +; +; AGGRESSIVE-LABEL: define i32 @or.2( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: ret i32 [[X]] +; + %x.0 = and i32 %x, u0xff00ffff + %x.1 = and i32 %x, u0x00ff00ff + %or = or i32 %x.0, %x.1 + ret i32 %or +} + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @or.3(i32 %x) { +; LAZY-LABEL: define i32 @or.3( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_0:%.*]] = or i32 [[X]], 16711935 +; LAZY-NEXT: ret i32 [[X_0]] +; +; AGGRESSIVE-LABEL: define i32 @or.3( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -16711936 +; AGGRESSIVE-NEXT: [[X_0_MERGE:%.*]] = or disjoint i32 [[X_MASK]], 16711935 +; AGGRESSIVE-NEXT: ret i32 [[X_0_MERGE]] +; + %x.0 = or i32 %x, u0x00ff00ff + ret i32 %x.0 +} + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @xor.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @xor.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[XOR]] +; +; AGGRESSIVE-LABEL: define i32 
@xor.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936 +; AGGRESSIVE-NEXT: [[XOR_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[XOR_MERGE]] +; + %a.mask = and i32 %a, u0xff00ff00 + %b.mask = and i32 %b, u0x00ff00ff + %xor = xor i32 %a.mask, %b.mask + ret i32 %xor +} + +;; u0xff00ffff = -16711681 +;; u0x00ff00ff = 16711935 +;; Nothing happens in this case because of the overlapping bytes. +define i32 @xor.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @xor.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[XOR]] +; +; AGGRESSIVE-LABEL: define i32 @xor.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681 +; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK2]], [[B_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[XOR]] +; + %a.mask = and i32 %a, u0xff00ffff + %b.mask = and i32 %b, u0x00ff00ff + %xor = xor i32 %a.mask, %b.mask + ret i32 %xor +} + +define i32 @xor.2(i32 %x) { +; LAZY-LABEL: define i32 @xor.2( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -256 +; LAZY-NEXT: ret i32 [[X_MASK]] +; +; AGGRESSIVE-LABEL: define i32 @xor.2( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -256 +; AGGRESSIVE-NEXT: ret i32 [[X_MASK]] +; + %x.0 = and i32 %x, u0xff00ffff + %x.1 = and i32 %x, u0x00ff00ff + %xor = xor i32 %x.0, %x.1 + ret i32 %xor +} + +define i32 @shl.0(i32 %base) { +; LAZY-LABEL: define i32 @shl.0( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 16777215 +; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 8 +; LAZY-NEXT: [[POST:%.*]] = and i32 [[SHL]], -256 +; LAZY-NEXT: ret i32 [[POST]] +; +; AGGRESSIVE-LABEL: define i32 @shl.0( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = shl i32 [[BASE]], 8 +; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]] +; + %pre = and i32 %base, u0x00ffffff + %shl = shl i32 %pre, 8 + %post = and i32 %shl, u0xffffff00 + ret i32 %post +} + +;; u0x0000ff00 = 65280 +define i32 @shl.1(i32 %base) { +; LAZY-LABEL: define i32 @shl.1( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], -16711936 +; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 8 +; LAZY-NEXT: ret i32 [[SHL]] +; +; AGGRESSIVE-LABEL: define i32 @shl.1( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_MASK:%.*]] = and i32 [[BASE]], 65280 +; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = shl i32 [[BASE_MASK]], 8 +; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]] +; + %pre = and i32 %base, u0xff00ff00 + %shl = shl i32 %pre, 8 + ret i32 %shl +} + +;; u0x0fffffff = 268435455 +;; Nothing happens because it is not byte-aligned. 
+define i32 @shl.2(i32 %base) { +; LAZY-LABEL: define i32 @shl.2( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 268435455 +; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 4 +; LAZY-NEXT: [[POST:%.*]] = and i32 [[SHL]], -16 +; LAZY-NEXT: ret i32 [[POST]] +; +; AGGRESSIVE-LABEL: define i32 @shl.2( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 268435455 +; AGGRESSIVE-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 4 +; AGGRESSIVE-NEXT: [[POST:%.*]] = and i32 [[SHL]], -16 +; AGGRESSIVE-NEXT: ret i32 [[POST]] +; + %pre = and i32 %base, u0x0fffffff + %shl = shl i32 %pre, 4 + %post = and i32 %shl, u0xfffffff0 + ret i32 %post +} + +define <8 x i8> @shl.vec(<2 x i32> %base) { +; LAZY-LABEL: define <8 x i8> @shl.vec( +; LAZY-SAME: <2 x i32> [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and <2 x i32> [[BASE]], +; LAZY-NEXT: [[SHL:%.*]] = shl <2 x i32> [[PRE]], +; LAZY-NEXT: [[POST:%.*]] = and <2 x i32> [[SHL]], splat (i32 -256) +; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[POST]] to <8 x i8> +; LAZY-NEXT: ret <8 x i8> [[CAST]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @shl.vec( +; AGGRESSIVE-SAME: <2 x i32> [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_CAST:%.*]] = bitcast <2 x i32> [[BASE]] to <8 x i8> +; AGGRESSIVE-NEXT: [[BASE_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[BASE_SHUFFLE2:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> [[BASE_SHUFFLE]], <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[BASE_SHUFFLE2]] +; + %pre = and <2 x i32> %base, + %shl = shl <2 x i32> %pre, + %post = and <2 x i32> %shl, splat(i32 u0xffffff00) + %cast = bitcast <2 x i32> %post to <8 x i8> + ret <8 x i8> %cast +} + +define i32 @lshr.0(i32 %base) { +; LAZY-LABEL: define i32 @lshr.0( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], -256 +; LAZY-NEXT: [[LSHR:%.*]] = lshr i32 [[PRE]], 8 +; LAZY-NEXT: [[POST:%.*]] = and i32 [[LSHR]], 16777215 +; LAZY-NEXT: ret i32 [[POST]] +; +; AGGRESSIVE-LABEL: define i32 @lshr.0( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = lshr i32 [[BASE]], 8 +; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]] +; + %pre = and i32 %base, u0xffffff00 + %lshr = lshr i32 %pre, 8 + %post = and i32 %lshr, u0x00ffffff + ret i32 %post +} + +;; u0x0000ff00 = 65280 +define i32 @lshr.1(i32 %base) { +; LAZY-LABEL: define i32 @lshr.1( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 16711935 +; LAZY-NEXT: [[LSHR:%.*]] = lshr i32 [[PRE]], 8 +; LAZY-NEXT: ret i32 [[LSHR]] +; +; AGGRESSIVE-LABEL: define i32 @lshr.1( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = lshr i32 [[BASE]], 8 +; AGGRESSIVE-NEXT: [[BASE_MASK:%.*]] = and i32 [[BASE_SHIFT]], 65280 +; AGGRESSIVE-NEXT: ret i32 [[BASE_MASK]] +; + %pre = and i32 %base, u0x00ff00ff + %lshr = lshr i32 %pre, 8 + ret i32 %lshr +} + +define <8 x i8> @lshr.vec(<2 x i32> %base) { +; LAZY-LABEL: define <8 x i8> @lshr.vec( +; LAZY-SAME: <2 x i32> [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and <2 x i32> [[BASE]], +; LAZY-NEXT: [[LSHR:%.*]] = lshr <2 x i32> [[PRE]], +; LAZY-NEXT: [[POST:%.*]] = and <2 x i32> [[LSHR]], splat (i32 16777215) +; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[POST]] to <8 x i8> +; LAZY-NEXT: ret <8 x i8> [[CAST]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @lshr.vec( +; AGGRESSIVE-SAME: <2 x i32> [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_CAST:%.*]] = bitcast <2 x i32> [[BASE]] to <8 x i8> +; 
AGGRESSIVE-NEXT: [[BASE_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[BASE_SHUFFLE2:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> [[BASE_SHUFFLE]], <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[BASE_SHUFFLE2]] +; + %pre = and <2 x i32> %base, + %lshr = lshr <2 x i32> %pre, + %post = and <2 x i32> %lshr, splat(i32 u0x00ffffff) + %cast = bitcast <2 x i32> %post to <8 x i8> + ret <8 x i8> %cast +} + +define i32 @trunc.0(i64 %src) { +; LAZY-LABEL: define i32 @trunc.0( +; LAZY-SAME: i64 [[SRC:%.*]]) { +; LAZY-NEXT: [[MASK:%.*]] = and i64 [[SRC]], 4294967295 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[MASK]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @trunc.0( +; AGGRESSIVE-SAME: i64 [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = trunc i64 [[SRC]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[SRC_CAST]] +; + %mask = and i64 %src, u0x00000000ffffffff + %trunc = trunc i64 %mask to i32 + ret i32 %trunc +} + +;; u0xff00ff00 = -16711936 +define i32 @trunc.1(i64 %src) { +; LAZY-LABEL: define i32 @trunc.1( +; LAZY-SAME: i64 [[SRC:%.*]]) { +; LAZY-NEXT: [[MASK:%.*]] = and i64 [[SRC]], -71777214294589696 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[MASK]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @trunc.1( +; AGGRESSIVE-SAME: i64 [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = trunc i64 [[SRC]] to i32 +; AGGRESSIVE-NEXT: [[SRC_MASK:%.*]] = and i32 [[SRC_CAST]], -16711936 +; AGGRESSIVE-NEXT: ret i32 [[SRC_MASK]] +; + %mask = and i64 %src, u0xff00ff00ff00ff00 + %trunc = trunc i64 %mask to i32 + ret i32 %trunc +} + +define <4 x i8> @trunc.vec(<2 x i32> %src) { +; LAZY-LABEL: define <4 x i8> @trunc.vec( +; LAZY-SAME: <2 x i32> [[SRC:%.*]]) { +; LAZY-NEXT: [[TRUNC:%.*]] = trunc <2 x i32> [[SRC]] to <2 x i16> +; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i16> [[TRUNC]] to <4 x i8> +; LAZY-NEXT: ret <4 x i8> [[CAST]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @trunc.vec( +; AGGRESSIVE-SAME: <2 x i32> [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8> +; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = shufflevector <8 x i8> [[SRC_CAST]], <8 x i8> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[SRC_SHUFFLE:%.*]] = shufflevector <4 x i8> [[SRC_EXTRACT]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[SRC_EXTRACT2:%.*]] = shufflevector <8 x i8> [[SRC_CAST]], <8 x i8> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[SRC_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[SRC_EXTRACT2]], <4 x i8> [[SRC_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[SRC_SHUFFLE4]] +; + %trunc = trunc <2 x i32> %src to <2 x i16> + %cast = bitcast <2 x i16> %trunc to <4 x i8> + ret <4 x i8> %cast +} + +define i32 @bitcast.0(i32 %x) { +; LAZY-LABEL: define i32 @bitcast.0( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[CAST:%.*]] = bitcast i32 [[X]] to <4 x i8> +; LAZY-NEXT: [[BACK:%.*]] = bitcast <4 x i8> [[CAST]] to i32 +; LAZY-NEXT: ret i32 [[BACK]] +; +; AGGRESSIVE-LABEL: define i32 @bitcast.0( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: ret i32 [[X]] +; + %cast = bitcast i32 %x to <4 x i8> + %back = bitcast <4 x i8> %cast to i32 + ret i32 %back +} + +define i8 @extractelement.0(i32 %src) { +; LAZY-LABEL: define i8 @extractelement.0( +; LAZY-SAME: i32 [[SRC:%.*]]) { +; LAZY-NEXT: [[CAST:%.*]] = bitcast i32 [[SRC]] to <4 x i8> +; LAZY-NEXT: [[ELT:%.*]] = extractelement <4 x i8> [[CAST]], i64 3 +; LAZY-NEXT: ret i8 [[ELT]] +; +; AGGRESSIVE-LABEL: define i8 
@extractelement.0( +; AGGRESSIVE-SAME: i32 [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_SHIFT:%.*]] = lshr i32 [[SRC]], 24 +; AGGRESSIVE-NEXT: [[SRC_SHIFT_CAST:%.*]] = trunc i32 [[SRC_SHIFT]] to i8 +; AGGRESSIVE-NEXT: ret i8 [[SRC_SHIFT_CAST]] +; + %cast = bitcast i32 %src to <4 x i8> + %elt = extractelement <4 x i8> %cast, i64 3 + ret i8 %elt +} + +define i32 @insertelement.0(i8 %src) { +; LAZY-LABEL: define i32 @insertelement.0( +; LAZY-SAME: i8 [[SRC:%.*]]) { +; LAZY-NEXT: [[INSERT:%.*]] = insertelement <4 x i8> zeroinitializer, i8 [[SRC]], i64 3 +; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[INSERT]] to i32 +; LAZY-NEXT: ret i32 [[CAST]] +; +; AGGRESSIVE-LABEL: define i32 @insertelement.0( +; AGGRESSIVE-SAME: i8 [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = zext i8 [[SRC]] to i32 +; AGGRESSIVE-NEXT: [[SRC_SHIFT:%.*]] = shl i32 [[SRC_CAST]], 24 +; AGGRESSIVE-NEXT: ret i32 [[SRC_SHIFT]] +; + %insert = insertelement <4 x i8> zeroinitializer, i8 %src, i64 3 + %cast = bitcast <4 x i8> %insert to i32 + ret i32 %cast +} + +define i32 @insertelement.1(i8 %a, i8 %b) { +; LAZY-LABEL: define i32 @insertelement.1( +; LAZY-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; LAZY-NEXT: [[INSERT_A:%.*]] = insertelement <4 x i8> zeroinitializer, i8 [[A]], i64 3 +; LAZY-NEXT: [[INSERT_B:%.*]] = insertelement <4 x i8> [[INSERT_A]], i8 [[B]], i64 1 +; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[INSERT_B]] to i32 +; LAZY-NEXT: ret i32 [[CAST]] +; +; AGGRESSIVE-LABEL: define i32 @insertelement.1( +; AGGRESSIVE-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = zext i8 [[A]] to i32 +; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = zext i8 [[B]] to i32 +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i32 [[B_CAST]], 8 +; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = shl i32 [[A_CAST]], 24 +; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[B_SHIFT]], [[A_SHIFT]] +; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]] +; + %insert.a = insertelement <4 x i8> zeroinitializer, i8 %a, i64 3 + %insert.b = insertelement <4 x i8> %insert.a, i8 %b, i64 1 + %cast = bitcast <4 x i8> %insert.b to i32 + ret i32 %cast +} + +define i64 @shufflevector.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i64 @shufflevector.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8> +; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8> +; LAZY-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> [[B_CAST]], <8 x i32> +; LAZY-NEXT: [[CAST:%.*]] = bitcast <8 x i8> [[SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[CAST]] +; +; AGGRESSIVE-LABEL: define i64 @shufflevector.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_CAST2:%.*]] = zext i32 [[B]] to i64 +; AGGRESSIVE-NEXT: [[A_CAST1:%.*]] = zext i32 [[A]] to i64 +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i64 [[B_CAST2]], 32 +; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i64 [[A_CAST1]], [[B_SHIFT]] +; AGGRESSIVE-NEXT: ret i64 [[CAST_MERGE]] +; + %a.cast = bitcast i32 %a to <4 x i8> + %b.cast = bitcast i32 %b to <4 x i8> + %shuffle = shufflevector <4 x i8> %a.cast, <4 x i8> %b.cast, <8 x i32> + %cast = bitcast <8 x i8> %shuffle to i64 + ret i64 %cast +} + +define i32 @shufflevector.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @shufflevector.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8> +; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8> +; LAZY-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> [[B_CAST]], <4 x i32> +; 
LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[SHUFFLE]] to i32 +; LAZY-NEXT: ret i32 [[CAST]] +; +; AGGRESSIVE-LABEL: define i32 @shufflevector.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK:%.*]] = and i32 [[A]], 16711935 +; AGGRESSIVE-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16711936 +; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[A_MASK]], [[B_MASK]] +; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]] +; + %a.cast = bitcast i32 %a to <4 x i8> + %b.cast = bitcast i32 %b to <4 x i8> + %shuffle = shufflevector <4 x i8> %a.cast, <4 x i8> %b.cast, <4 x i32> + %cast = bitcast <4 x i8> %shuffle to i32 + ret i32 %cast +} + +define i32 @shufflevector.2(i32 %a, i64 %b) { +; LAZY-LABEL: define i32 @shufflevector.2( +; LAZY-SAME: i32 [[A:%.*]], i64 [[B:%.*]]) { +; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8> +; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i64 [[B]] to <8 x i8> +; LAZY-NEXT: [[SHUFFLE_0:%.*]] = shufflevector <8 x i8> [[B_CAST]], <8 x i8> poison, <4 x i32> +; LAZY-NEXT: [[SHUFFLE_1:%.*]] = shufflevector <4 x i8> [[SHUFFLE_0]], <4 x i8> [[A_CAST]], <4 x i32> +; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[SHUFFLE_1]] to i32 +; LAZY-NEXT: ret i32 [[CAST]] +; +; AGGRESSIVE-LABEL: define i32 @shufflevector.2( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i64 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = lshr i64 [[B]], 48 +; AGGRESSIVE-NEXT: [[B_SHIFT_CAST:%.*]] = trunc i64 [[B_SHIFT]] to i32 +; AGGRESSIVE-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -65536 +; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[B_SHIFT_CAST]], [[A_MASK]] +; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]] +; + %a.cast = bitcast i32 %a to <4 x i8> + %b.cast = bitcast i64 %b to <8 x i8> + %shuffle.0 = shufflevector <8 x i8> %b.cast, <8 x i8> poison, <4 x i32> + %shuffle.1 = shufflevector <4 x i8> %shuffle.0, <4 x i8> %a.cast, <4 x i32> + %cast = bitcast <4 x i8> %shuffle.1 to i32 + ret i32 %cast +} diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll new file mode 100644 index 0000000000000..fe08ce93719d0 --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll @@ -0,0 +1,302 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +define i16 @top_bytes(i32 %a, i32 %b) { +; LAZY-LABEL: define i16 @top_bytes( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16777216 +; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 16 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16777216 +; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 24 +; LAZY-NEXT: [[RES:%.*]] = or i32 [[A_LSHR]], [[B_LSHR]] +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i32 [[RES]] to i16 +; LAZY-NEXT: ret i16 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i16 @top_bytes( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = lshr i32 [[B]], 24 +; AGGRESSIVE-NEXT: [[B_SHIFT_CAST:%.*]] = trunc i32 [[B_SHIFT]] to i16 +; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = lshr i32 [[A]], 16 +; AGGRESSIVE-NEXT: [[A_SHIFT_CAST:%.*]] = trunc i32 [[A_SHIFT]] to i16 +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i16 [[A_SHIFT_CAST]], -256 +; AGGRESSIVE-NEXT: [[TRUNC_MERGE:%.*]] = or disjoint i16 [[B_SHIFT_CAST]], [[A_MASK2]] +; AGGRESSIVE-NEXT: ret i16 [[TRUNC_MERGE]] +; + 
%a.mask = and i32 %a, u0xff000000 + %a.lshr = lshr i32 %a.mask, 16 + %b.mask = and i32 %b, u0xff000000 + %b.lshr = lshr i32 %b.mask, 24 + %res = or i32 %a.lshr, %b.lshr + %trunc = trunc i32 %res to i16 + ret i16 %trunc +} + +define i32 @bottom_bytes(i16 %a, i16 %b) { +; LAZY-LABEL: define i32 @bottom_bytes( +; LAZY-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i16 [[A]], 255 +; LAZY-NEXT: [[A_SHL:%.*]] = shl i16 [[A_MASK]], 8 +; LAZY-NEXT: [[B_MASK:%.*]] = and i16 [[B]], 255 +; LAZY-NEXT: [[RES:%.*]] = or i16 [[A_SHL]], [[B_MASK]] +; LAZY-NEXT: [[ZEXT:%.*]] = zext i16 [[RES]] to i32 +; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], 16 +; LAZY-NEXT: ret i32 [[SHL]] +; +; AGGRESSIVE-LABEL: define i32 @bottom_bytes( +; AGGRESSIVE-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = zext i16 [[A]] to i32 +; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i16 [[B]], 255 +; AGGRESSIVE-NEXT: [[B_MASK_CAST:%.*]] = zext i16 [[B_MASK2]] to i32 +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i32 [[B_MASK_CAST]], 16 +; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = shl i32 [[A_CAST]], 24 +; AGGRESSIVE-NEXT: [[SHL_MERGE:%.*]] = or disjoint i32 [[B_SHIFT]], [[A_SHIFT]] +; AGGRESSIVE-NEXT: ret i32 [[SHL_MERGE]] +; + %a.mask = and i16 %a, u0x00ff + %a.shl = shl i16 %a.mask, 8 + %b.mask = and i16 %b, u0x00ff + %res = or i16 %a.shl, %b.mask + %zext = zext i16 %res to i32 + %shl = shl i32 %zext, 16 + ret i32 %shl +} + +define i32 @obtain_i32(i32 %from) { +; LAZY-LABEL: define i32 @obtain_i32( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: ret i32 [[FROM]] +; +; AGGRESSIVE-LABEL: define i32 @obtain_i32( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: ret i32 [[FROM]] +; + %get.0 = and i32 %from, 255 + %shr.1 = lshr i32 %from, 8 + %mask.1 = and i32 %shr.1, 255 + %get.1 = shl i32 %mask.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %shr.2 = lshr i32 %from, 16 + %mask.2 = and i32 %shr.2, 255 + %get.2 = shl i32 %mask.2, 16 + %shr.3 = lshr i32 %from, 24 + %mask.3 = and i32 %shr.3, 255 + %get.3 = shl i32 %mask.3, 24 + %out.2 = or i32 %get.2, %get.3 + + %out = or i32 %out.1, %out.2 + ret i32 %out +} + +;; u0xff00ffff = -16711681 +define i32 @obtain_i32_masked(i32 %from) { +; LAZY-LABEL: define i32 @obtain_i32_masked( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM]], -16711681 +; LAZY-NEXT: ret i32 [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define i32 @obtain_i32_masked( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM]], -16711681 +; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]] +; + %get.0 = and i32 %from, 255 + %shr.1 = lshr i32 %from, 8 + %mask.1 = and i32 %shr.1, 255 + %get.1 = shl i32 %mask.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %shr.3 = lshr i32 %from, 24 + %mask.3 = and i32 %shr.3, 255 + %get.3 = shl i32 %mask.3, 24 + %out.2 = or i32 %out.1, %get.3 + + ret i32 %out.2 +} + +define i64 @obtain_i64(i64 %from) { +; LAZY-LABEL: define i64 @obtain_i64( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_MASK:%.*]] = and i64 [[FROM]], 4294967295 +; LAZY-NEXT: ret i64 [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define i64 @obtain_i64( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i64 [[FROM]], 4294967295 +; AGGRESSIVE-NEXT: ret i64 [[FROM_MASK]] +; + %mask.0 = and i64 %from, 255 + %get.0 = shl i64 %mask.0, 0 + %shr.1 = lshr i64 %from, 8 + %mask.1 = and i64 %shr.1, 255 + %get.1 = shl i64 %mask.1, 8 + %out.1 = or i64 %get.0, %get.1 + + %shr.2 = lshr i64 %from, 16 + %mask.2 = and i64 
%shr.2, 255 + %get.2 = shl i64 %mask.2, 16 + %shr.3 = lshr i64 %from, 24 + %mask.3 = and i64 %shr.3, 255 + %get.3 = shl i64 %mask.3, 24 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @obtain_i64_shifted(i64 %from) { +; LAZY-LABEL: define i64 @obtain_i64_shifted( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM]], 32 +; LAZY-NEXT: ret i64 [[FROM_SHIFT]] +; +; AGGRESSIVE-LABEL: define i64 @obtain_i64_shifted( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM]], 32 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHIFT]] +; + %mask.0 = and i64 %from, 255 + %get.0 = shl i64 %mask.0, 32 + %shr.1 = lshr i64 %from, 8 + %mask.1 = and i64 %shr.1, 255 + %get.1 = shl i64 %mask.1, 40 + %out.1 = or i64 %get.0, %get.1 + + %shr.2 = lshr i64 %from, 16 + %mask.2 = and i64 %shr.2, 255 + %get.2 = shl i64 %mask.2, 48 + %shr.3 = lshr i64 %from, 24 + %mask.3 = and i64 %shr.3, 255 + %get.3 = shl i64 %mask.3, 56 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @obtain_i64_zext(i32 %from) { +; LAZY-LABEL: define i64 @obtain_i64_zext( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = zext i32 [[FROM]] to i64 +; LAZY-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM_CAST]], 32 +; LAZY-NEXT: ret i64 [[FROM_SHIFT]] +; +; AGGRESSIVE-LABEL: define i64 @obtain_i64_zext( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = zext i32 [[FROM]] to i64 +; AGGRESSIVE-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM_CAST]], 32 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHIFT]] +; + %mask.0 = and i32 %from, 255 + %zext.0 = zext i32 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + %shr.1 = lshr i32 %from, 8 + %mask.1 = and i32 %shr.1, 255 + %zext.1 = zext i32 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 40 + %out.1 = or i64 %get.0, %get.1 + + %shr.2 = lshr i32 %from, 16 + %mask.2 = and i32 %shr.2, 255 + %zext.2 = zext i32 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 48 + %shr.3 = lshr i32 %from, 24 + %mask.3 = and i32 %shr.3, 255 + %zext.3 = zext i32 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 56 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @combine(i32 %bot, i32 %top) { +; LAZY-LABEL: define i64 @combine( +; LAZY-SAME: i32 [[BOT:%.*]], i32 [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = zext i32 [[TOP]] to i64 +; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; LAZY-NEXT: [[TOP_SHIFT:%.*]] = shl i64 [[TOP_CAST]], 32 +; LAZY-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHIFT]] +; LAZY-NEXT: ret i64 [[OUT_3_MERGE]] +; +; AGGRESSIVE-LABEL: define i64 @combine( +; AGGRESSIVE-SAME: i32 [[BOT:%.*]], i32 [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = zext i32 [[TOP]] to i64 +; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; AGGRESSIVE-NEXT: [[TOP_SHIFT:%.*]] = shl i64 [[TOP_CAST]], 32 +; AGGRESSIVE-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHIFT]] +; AGGRESSIVE-NEXT: ret i64 [[OUT_3_MERGE]] +; + %base = zext i32 %bot to i64 + + %mask.0 = and i32 %top, 255 + %zext.0 = zext i32 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + %out.0 = or i64 %base, %get.0 + + %shr.1 = lshr i32 %top, 8 + %mask.1 = and i32 %shr.1, 255 + %zext.1 = zext i32 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 40 + %out.1 = or i64 %out.0, %get.1 + + %shr.2 = lshr i32 %top, 16 + %mask.2 = and i32 %shr.2, 255 + %zext.2 = zext i32 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 
48 + %out.2 = or i64 %out.1, %get.2 + + %shr.3 = lshr i32 %top, 24 + %mask.3 = and i32 %shr.3, 255 + %zext.3 = zext i32 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 56 + %out.3 = or i64 %out.2, %get.3 + + ret i64 %out.3 +} + +;; u0x0000ff00 = 65280 +;; u0x00ff0000 = 16711680 +;; u0xff000000 = -16777216 +;; u0xff0000ff = -16776961 +define i32 @shuffle_elts(i32 %x) { +; LAZY-LABEL: define i32 @shuffle_elts( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_0:%.*]] = and i32 [[X]], 255 +; LAZY-NEXT: [[X_1:%.*]] = and i32 [[X]], 65280 +; LAZY-NEXT: [[X_2:%.*]] = and i32 [[X]], 16711680 +; LAZY-NEXT: [[X_3:%.*]] = and i32 [[X]], -16777216 +; LAZY-NEXT: [[SHL_1:%.*]] = shl i32 [[X_1]], 8 +; LAZY-NEXT: [[OUT_1:%.*]] = or i32 [[X_0]], [[SHL_1]] +; LAZY-NEXT: [[SHR_2:%.*]] = lshr i32 [[X_2]], 8 +; LAZY-NEXT: [[OUT_2:%.*]] = or i32 [[SHR_2]], [[X_3]] +; LAZY-NEXT: [[OUT:%.*]] = or i32 [[OUT_1]], [[OUT_2]] +; LAZY-NEXT: ret i32 [[OUT]] +; +; AGGRESSIVE-LABEL: define i32 @shuffle_elts( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -16776961 +; AGGRESSIVE-NEXT: [[X_SHIFT:%.*]] = lshr i32 [[X]], 8 +; AGGRESSIVE-NEXT: [[X_MASK2:%.*]] = and i32 [[X_SHIFT]], 65280 +; AGGRESSIVE-NEXT: [[X_MASK4:%.*]] = and i32 [[X]], 65280 +; AGGRESSIVE-NEXT: [[X_SHIFT6:%.*]] = shl i32 [[X_MASK4]], 8 +; AGGRESSIVE-NEXT: [[OUT_MERGE:%.*]] = or disjoint i32 [[X_MASK]], [[X_MASK2]] +; AGGRESSIVE-NEXT: [[OUT_MERGE8:%.*]] = or disjoint i32 [[OUT_MERGE]], [[X_SHIFT6]] +; AGGRESSIVE-NEXT: ret i32 [[OUT_MERGE8]] +; + %x.0 = and i32 %x, u0x000000ff + %x.1 = and i32 %x, u0x0000ff00 + %x.2 = and i32 %x, u0x00ff0000 + %x.3 = and i32 %x, u0xff000000 + + %shl.1 = shl i32 %x.1, 8 + %out.1 = or i32 %x.0, %shl.1 + + %shr.2 = lshr i32 %x.2, 8 + %out.2 = or i32 %shr.2, %x.3 + + %out = or i32 %out.1, %out.2 + + ret i32 %out +} diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll new file mode 100644 index 0000000000000..e4c1538826e0f --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll @@ -0,0 +1,393 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +define <2 x i8> @top_bytes(i32 %a, i32 %b) { +; LAZY-LABEL: define <2 x i8> @top_bytes( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16777216 +; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 24 +; LAZY-NEXT: [[A_TRUNC:%.*]] = trunc i32 [[A_LSHR]] to i8 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16777216 +; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 24 +; LAZY-NEXT: [[B_TRUNC:%.*]] = trunc i32 [[B_LSHR]] to i8 +; LAZY-NEXT: [[BYTES_0:%.*]] = insertelement <2 x i8> poison, i8 [[A_TRUNC]], i32 1 +; LAZY-NEXT: [[BYTES_1:%.*]] = insertelement <2 x i8> [[BYTES_0]], i8 [[B_TRUNC]], i32 0 +; LAZY-NEXT: ret <2 x i8> [[BYTES_1]] +; +; AGGRESSIVE-LABEL: define <2 x i8> @top_bytes( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8> +; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8> +; AGGRESSIVE-NEXT: [[B_EXTRACT:%.*]] = shufflevector <4 x i8> [[B_CAST]], <4 x i8> poison, <2 x i32> +; AGGRESSIVE-NEXT: [[B_SHUFFLE:%.*]] = shufflevector <2 x i8> [[B_EXTRACT]], <2 x i8> zeroinitializer, <2 x i32> +; 
AGGRESSIVE-NEXT: [[A_EXTRACT:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> poison, <2 x i32> +; AGGRESSIVE-NEXT: [[A_SHUFFLE:%.*]] = shufflevector <2 x i8> [[A_EXTRACT]], <2 x i8> [[B_SHUFFLE]], <2 x i32> +; AGGRESSIVE-NEXT: ret <2 x i8> [[A_SHUFFLE]] +; + %a.mask = and i32 %a, u0xff000000 + %a.lshr = lshr i32 %a.mask, 24 + %a.trunc = trunc i32 %a.lshr to i8 + %b.mask = and i32 %b, u0xff000000 + %b.lshr = lshr i32 %b.mask, 24 + %b.trunc = trunc i32 %b.lshr to i8 + %bytes.0 = insertelement <2 x i8> poison, i8 %a.trunc, i32 1 + %bytes.1 = insertelement <2 x i8> %bytes.0, i8 %b.trunc, i32 0 + ret <2 x i8> %bytes.1 +} + +define <2 x i16> @top_bytes.i16(i32 %a, i32 %b) { +; LAZY-LABEL: define <2 x i16> @top_bytes.i16( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -65536 +; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 16 +; LAZY-NEXT: [[A_TRUNC:%.*]] = trunc i32 [[A_LSHR]] to i16 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -65536 +; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 16 +; LAZY-NEXT: [[B_TRUNC:%.*]] = trunc i32 [[B_LSHR]] to i16 +; LAZY-NEXT: [[BYTES_0:%.*]] = insertelement <2 x i16> poison, i16 [[A_TRUNC]], i32 1 +; LAZY-NEXT: [[BYTES_1:%.*]] = insertelement <2 x i16> [[BYTES_0]], i16 [[B_TRUNC]], i32 0 +; LAZY-NEXT: ret <2 x i16> [[BYTES_1]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @top_bytes.i16( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <2 x i16> +; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <2 x i16> +; AGGRESSIVE-NEXT: [[B_SHUFFLE:%.*]] = shufflevector <2 x i16> [[B_CAST]], <2 x i16> zeroinitializer, <2 x i32> +; AGGRESSIVE-NEXT: [[A_SHUFFLE:%.*]] = shufflevector <2 x i16> [[A_CAST]], <2 x i16> [[B_SHUFFLE]], <2 x i32> +; AGGRESSIVE-NEXT: ret <2 x i16> [[A_SHUFFLE]] +; + %a.mask = and i32 %a, u0xffff0000 + %a.lshr = lshr i32 %a.mask, 16 + %a.trunc = trunc i32 %a.lshr to i16 + %b.mask = and i32 %b, u0xffff0000 + %b.lshr = lshr i32 %b.mask, 16 + %b.trunc = trunc i32 %b.lshr to i16 + %bytes.0 = insertelement <2 x i16> poison, i16 %a.trunc, i32 1 + %bytes.1 = insertelement <2 x i16> %bytes.0, i16 %b.trunc, i32 0 + ret <2 x i16> %bytes.1 +} + +define <4 x i8> @obtain_i32(i32 %from) { +; LAZY-LABEL: define <4 x i8> @obtain_i32( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8> +; LAZY-NEXT: ret <4 x i8> [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @obtain_i32( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_CAST]] +; + %get.0 = trunc i32 %from to i8 + + %shr.1 = lshr i32 %from, 8 + %get.1 = trunc i32 %shr.1 to i8 + + %shr.2 = lshr i32 %from, 16 + %get.2 = trunc i32 %shr.2 to i8 + + %shr.3 = lshr i32 %from, 24 + %get.3 = trunc i32 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> poison, i8 %get.0, i32 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %get.1, i32 1 + %build.2 = insertelement <4 x i8> %build.1, i8 %get.2, i32 2 + %build.3 = insertelement <4 x i8> %build.2, i8 %get.3, i32 3 + ret <4 x i8> %build.3 +} + +define <2 x i16> @obtain_i32.i16(i32 %from) { +; LAZY-LABEL: define <2 x i16> @obtain_i32.i16( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16> +; LAZY-NEXT: ret <2 x i16> [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @obtain_i32.i16( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: 
[[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_CAST]] +; + %get.0 = trunc i32 %from to i16 + + %shr.1 = lshr i32 %from, 16 + %get.1 = trunc i32 %shr.1 to i16 + + %build.0 = insertelement <2 x i16> poison, i16 %get.0, i32 0 + %build.1 = insertelement <2 x i16> %build.0, i16 %get.1, i32 1 + ret <2 x i16> %build.1 +} + +define <4 x i8> @obtain_i32_masked(i32 %from) { +; LAZY-LABEL: define <4 x i8> @obtain_i32_masked( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @obtain_i32_masked( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_SHUFFLE]] +; + %get.0 = trunc i32 %from to i8 + + %shr.1 = lshr i32 %from, 8 + %get.1 = trunc i32 %shr.1 to i8 + + %shr.3 = lshr i32 %from, 24 + %get.3 = trunc i32 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> , i8 %get.0, i32 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %get.1, i32 1 + %build.3 = insertelement <4 x i8> %build.1, i8 %get.3, i32 3 + ret <4 x i8> %build.3 +} + +define <2 x i16> @obtain_i32_masked.i16(i32 %from) { +; LAZY-LABEL: define <2 x i16> @obtain_i32_masked.i16( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16> +; LAZY-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], +; LAZY-NEXT: ret <2 x i16> [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @obtain_i32_masked.i16( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], +; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_MASK]] +; + %get.0 = trunc i32 %from to i16 + + %shr.1 = lshr i32 %from, 16 + %trunc.1 = trunc i32 %shr.1 to i16 + %get.1 = and i16 %trunc.1, u0xff00 + + %build.0 = insertelement <2 x i16> poison, i16 %get.0, i32 0 + %build.1 = insertelement <2 x i16> %build.0, i16 %get.1, i32 1 + ret <2 x i16> %build.1 +} + +define <8 x i8> @obtain_i64(i64 %from) { +; LAZY-LABEL: define <8 x i8> @obtain_i64( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @obtain_i64( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; + %get.0 = trunc i64 %from to i8 + + %shr.1 = lshr i64 %from, 8 + %get.1 = trunc i64 %shr.1 to i8 + + %shr.2 = lshr i64 %from, 16 + %get.2 = trunc i64 %shr.2 to i8 + + %shr.3 = lshr i64 %from, 24 + %get.3 = trunc i64 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> zeroinitializer, i8 %get.0, i32 0 + %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 1 + %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 2 + %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 3 + ret <8 x i8> %build.3 +} + 
+define <4 x i16> @obtain_i64.i16(i64 %from) { +; LAZY-LABEL: define <4 x i16> @obtain_i64.i16( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> @obtain_i64.i16( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; + %get.0 = trunc i64 %from to i16 + + %shr.1 = lshr i64 %from, 16 + %get.1 = trunc i64 %shr.1 to i16 + + %build.0 = insertelement <4 x i16> zeroinitializer, i16 %get.0, i32 0 + %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 1 + ret <4 x i16> %build.1 +} + +define <8 x i8> @obtain_i64_shifted(i64 %from) { +; LAZY-LABEL: define <8 x i8> @obtain_i64_shifted( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @obtain_i64_shifted( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; + %get.0 = trunc i64 %from to i8 + + %shr.1 = lshr i64 %from, 8 + %get.1 = trunc i64 %shr.1 to i8 + + %shr.2 = lshr i64 %from, 16 + %get.2 = trunc i64 %shr.2 to i8 + + %shr.3 = lshr i64 %from, 24 + %get.3 = trunc i64 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> zeroinitializer, i8 %get.0, i32 4 + %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 5 + %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 6 + %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 7 + ret <8 x i8> %build.3 +} + +define <4 x i16> @obtain_i64_shifted.i16(i64 %from) { +; LAZY-LABEL: define <4 x i16> @obtain_i64_shifted.i16( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> @obtain_i64_shifted.i16( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; + %get.0 = trunc i64 %from to i16 + + %shr.1 = lshr i64 %from, 16 + %get.1 = trunc i64 %shr.1 to i16 + + %build.0 = insertelement <4 x i16> zeroinitializer, i16 %get.0, i32 2 + %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 3 + ret <4 x i16> %build.1 +} + +define <8 x i8> @combine(<4 x i8> %bot, i32 %top) { +; LAZY-LABEL: define <8 x i8> @combine( +; LAZY-SAME: <4 x i8> [[BOT:%.*]], i32 [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <4 x i8> +; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x 
i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[TOP_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @combine( +; AGGRESSIVE-SAME: <4 x i8> [[BOT:%.*]], i32 [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <4 x i8> +; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[TOP_SHUFFLE]] +; + %base = shufflevector <4 x i8> %bot, <4 x i8> poison, <8 x i32> + + %get.0 = trunc i32 %top to i8 + + %shr.1 = lshr i32 %top, 8 + %get.1 = trunc i32 %shr.1 to i8 + + %shr.2 = lshr i32 %top, 16 + %get.2 = trunc i32 %shr.2 to i8 + + %shr.3 = lshr i32 %top, 24 + %get.3 = trunc i32 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> %base, i8 %get.0, i32 4 + %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 5 + %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 6 + %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 7 + + ret <8 x i8> %build.3 +} + +define <4 x i16> @combine.i16(<2 x i16> %bot, i32 %top) { +; LAZY-LABEL: define <4 x i16> @combine.i16( +; LAZY-SAME: <2 x i16> [[BOT:%.*]], i32 [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <2 x i16> +; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[TOP_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> @combine.i16( +; AGGRESSIVE-SAME: <2 x i16> [[BOT:%.*]], i32 [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <2 x i16> +; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[TOP_SHUFFLE]] +; + %base = shufflevector <2 x i16> %bot, <2 x i16> poison, <4 x i32> + + %get.0 = trunc i32 %top to i16 + + %shr.1 = lshr i32 %top, 16 + %get.1 = trunc i32 %shr.1 to i16 + + %build.0 = insertelement <4 x i16> %base, i16 %get.0, i32 2 + %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 3 + + ret <4 x i16> %build.1 +} + +define <4 x i8> @shuffle_elts(i32 %x) { +; LAZY-LABEL: define <4 x i8> @shuffle_elts( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; LAZY-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; LAZY-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to 
i8 +; LAZY-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16 +; LAZY-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8 +; LAZY-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; LAZY-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; LAZY-NEXT: [[BUILD_0:%.*]] = insertelement <4 x i8> poison, i8 [[X_0]], i32 0 +; LAZY-NEXT: [[BUILD_1:%.*]] = insertelement <4 x i8> [[BUILD_0]], i8 [[X_1]], i32 2 +; LAZY-NEXT: [[BUILD_2:%.*]] = insertelement <4 x i8> [[BUILD_1]], i8 [[X_2]], i32 1 +; LAZY-NEXT: [[X_SHUFFLE4:%.*]] = insertelement <4 x i8> [[BUILD_2]], i8 [[X_3]], i32 3 +; LAZY-NEXT: ret <4 x i8> [[X_SHUFFLE4]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @shuffle_elts( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_CAST:%.*]] = bitcast i32 [[X]] to <4 x i8> +; AGGRESSIVE-NEXT: [[X_SHUFFLE:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[X_SHUFFLE2:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> [[X_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: [[X_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> [[X_SHUFFLE2]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[X_SHUFFLE4]] +; + %x.0 = trunc i32 %x to i8 + + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> poison, i8 %x.0, i32 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %x.1, i32 2 + %build.2 = insertelement <4 x i8> %build.1, i8 %x.2, i32 1 + %build.3 = insertelement <4 x i8> %build.2, i8 %x.3, i32 3 + + ret <4 x i8> %build.3 +} diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll new file mode 100644 index 0000000000000..22ee7fcf5b4c6 --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll @@ -0,0 +1,480 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +define i32 @extract_i32(<4 x i8> %from) { +; LAZY-LABEL: define i32 @extract_i32( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32 +; LAZY-NEXT: ret i32 [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[FROM_CAST]] +; + %mask.0 = extractelement <4 x i8> %from, i64 0 + %get.0 = zext i8 %mask.0 to i32 + + %mask.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %mask.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %mask.2 to i32 + %get.2 = shl i32 %zext.2, 16 + + %mask.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %mask.3 to i32 + %get.3 = shl i32 %zext.3, 24 + %out.2 = or i32 %get.2, %get.3 + + %out = or i32 %out.1, %out.2 + ret i32 %out +} + +define i32 @extract_i32.i16(<2 x i16> %from) { +; LAZY-LABEL: define i32 @extract_i32.i16( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32 +; LAZY-NEXT: ret i32 [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32.i16( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x 
i16> [[FROM]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[FROM_CAST]] +; + %mask.0 = extractelement <2 x i16> %from, i64 0 + %get.0 = zext i16 %mask.0 to i32 + + %mask.1 = extractelement <2 x i16> %from, i64 1 + %zext.1 = zext i16 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 16 + + %out = or i32 %get.0, %get.1 + ret i32 %out +} + +define i32 @extract_i32_lower(<8 x i8> %from) { +; LAZY-LABEL: define i32 @extract_i32_lower( +; LAZY-SAME: <8 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM]], <8 x i8> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i8> [[FROM_SHUFFLE]] to i32 +; LAZY-NEXT: ret i32 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32_lower( +; AGGRESSIVE-SAME: <8 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM]], <8 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i8> [[FROM_SHUFFLE]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <8 x i8> %from, i64 4 + %get.0 = zext i8 %mask.0 to i32 + + %mask.1 = extractelement <8 x i8> %from, i64 5 + %zext.1 = zext i8 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %mask.2 = extractelement <8 x i8> %from, i64 6 + %zext.2 = zext i8 %mask.2 to i32 + %get.2 = shl i32 %zext.2, 16 + + %mask.3 = extractelement <8 x i8> %from, i64 7 + %zext.3 = zext i8 %mask.3 to i32 + %get.3 = shl i32 %zext.3, 24 + %out.2 = or i32 %get.2, %get.3 + + %out = or i32 %out.1, %out.2 + ret i32 %out +} + +define i32 @extract_i32_lower.i16(<4 x i16> %from) { +; LAZY-LABEL: define i32 @extract_i32_lower.i16( +; LAZY-SAME: <4 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM]], <4 x i16> zeroinitializer, <2 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <2 x i16> [[FROM_SHUFFLE]] to i32 +; LAZY-NEXT: ret i32 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32_lower.i16( +; AGGRESSIVE-SAME: <4 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM]], <4 x i16> zeroinitializer, <2 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <2 x i16> [[FROM_SHUFFLE]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <4 x i16> %from, i64 2 + %get.0 = zext i16 %mask.0 to i32 + + %mask.1 = extractelement <4 x i16> %from, i64 3 + %zext.1 = zext i16 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 16 + + %out = or i32 %get.0, %get.1 + ret i32 %out +} + +;; u0xff00ffff = -16711681 +define i32 @extract_i32_masked(<4 x i8> %from) { +; LAZY-LABEL: define i32 @extract_i32_masked( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32 +; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681 +; LAZY-NEXT: ret i32 [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32_masked( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32 +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681 +; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]] +; + %mask.0 = extractelement <4 x i8> %from, i64 0 + %get.0 = zext i8 %mask.0 to i32 + + %mask.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %mask.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %mask.3 to i32 + %get.3 = shl 
i32 %zext.3, 24 + %out.2 = or i32 %out.1, %get.3 + + ret i32 %out.2 +} + +;; u0xff00ffff = -16711681 +define i32 @extract_i32_masked.i16(<2 x i16> %from) { +; LAZY-LABEL: define i32 @extract_i32_masked.i16( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32 +; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681 +; LAZY-NEXT: ret i32 [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32_masked.i16( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32 +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681 +; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]] +; + %mask.0 = extractelement <2 x i16> %from, i64 0 + %get.0 = zext i16 %mask.0 to i32 + + %mask.1 = extractelement <2 x i16> %from, i64 1 + %mask.1.1 = and i16 %mask.1, u0xff00 + %zext.1 = zext i16 %mask.1.1 to i32 + %get.1 = shl i32 %zext.1, 16 + + %out = or i32 %get.0, %get.1 + ret i32 %out +} + +define i64 @extract_i64(<4 x i8> %from) { +; LAZY-LABEL: define i64 @extract_i64( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i64 @extract_i64( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <4 x i8> %from, i64 0 + %get.0 = zext i8 %mask.0 to i64 + + %mask.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 8 + %out.1 = or i64 %get.0, %get.1 + + %mask.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 16 + + %mask.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 24 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @extract_i64.i16(<2 x i16> %from) { +; LAZY-LABEL: define i64 @extract_i64.i16( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i64 @extract_i64.i16( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <2 x i16> %from, i64 0 + %get.0 = zext i16 %mask.0 to i64 + + %mask.1 = extractelement <2 x i16> %from, i64 1 + %zext.1 = zext i16 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 16 + + %out = or i64 %get.0, %get.1 + ret i64 %out +} + +define i64 @extract_i64_shifted(<4 x i8> %from) { +; LAZY-LABEL: define i64 @extract_i64_shifted( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> 
[[FROM_SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i64 @extract_i64_shifted( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <4 x i8> %from, i64 0 + %zext.0 = zext i8 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + + %mask.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 40 + %out.1 = or i64 %get.0, %get.1 + + %mask.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 48 + + %mask.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 56 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @extract_i64_shifted.i16(<2 x i16> %from) { +; LAZY-LABEL: define i64 @extract_i64_shifted.i16( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i64 @extract_i64_shifted.i16( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <2 x i16> %from, i64 0 + %zext.0 = zext i16 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + + %mask.1 = extractelement <2 x i16> %from, i64 1 + %zext.1 = zext i16 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 48 + + %out = or i64 %get.0, %get.1 + ret i64 %out +} + +define i64 @extract_combine(i32 %bot, <4 x i8> %top) { +; LAZY-LABEL: define i64 @extract_combine( +; LAZY-SAME: i32 [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) { +; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i8> [[TOP]], <4 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[TOP_SHUFFLE]] to i64 +; LAZY-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]] +; LAZY-NEXT: ret i64 [[OUT_3_MERGE]] +; +; AGGRESSIVE-LABEL: define i64 @extract_combine( +; AGGRESSIVE-SAME: i32 [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i8> [[TOP]], <4 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[TOP_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]] +; AGGRESSIVE-NEXT: ret i64 [[OUT_3_MERGE]] +; + %base = zext i32 %bot to i64 + + %mask.0 = extractelement <4 x i8> %top, i64 0 + %zext.0 = zext i8 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + %out.0 = or i64 %base, %get.0 + + %mask.1 = extractelement <4 x i8> %top, i64 1 + %zext.1 = zext i8 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 40 + %out.1 = or i64 %out.0, %get.1 + + %mask.2 = extractelement <4 x i8> %top, i64 2 + %zext.2 = zext i8 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 48 + %out.2 = or i64 %out.1, 
%get.2 + + %mask.3 = extractelement <4 x i8> %top, i64 3 + %zext.3 = zext i8 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 56 + %out.3 = or i64 %out.2, %get.3 + + ret i64 %out.3 +} + +define i64 @extract_combine.i16(i32 %bot, <2 x i16> %top) { +; LAZY-LABEL: define i64 @extract_combine.i16( +; LAZY-SAME: i32 [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) { +; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <2 x i16> [[TOP]], <2 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[TOP_SHUFFLE]] to i64 +; LAZY-NEXT: [[OUT_1_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]] +; LAZY-NEXT: ret i64 [[OUT_1_MERGE]] +; +; AGGRESSIVE-LABEL: define i64 @extract_combine.i16( +; AGGRESSIVE-SAME: i32 [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <2 x i16> [[TOP]], <2 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[TOP_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: [[OUT_1_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]] +; AGGRESSIVE-NEXT: ret i64 [[OUT_1_MERGE]] +; + %base = zext i32 %bot to i64 + + %mask.0 = extractelement <2 x i16> %top, i64 0 + %zext.0 = zext i16 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + %out.0 = or i64 %base, %get.0 + + %mask.1 = extractelement <2 x i16> %top, i64 1 + %zext.1 = zext i16 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 48 + %out.1 = or i64 %out.0, %get.1 + + ret i64 %out.1 +} + +define i32 @extract_bigelt.0(<4 x i64> %src) { +; LAZY-LABEL: define i32 @extract_bigelt.0( +; LAZY-SAME: <4 x i64> [[SRC:%.*]]) { +; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SRC_0]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @extract_bigelt.0( +; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <4 x i64> [[SRC]] to <8 x i32> +; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = extractelement <8 x i32> [[SRC_CAST]], i32 6 +; AGGRESSIVE-NEXT: ret i32 [[SRC_EXTRACT]] +; + %src.0 = extractelement <4 x i64> %src, i64 3 + %trunc = trunc i64 %src.0 to i32 + ret i32 %trunc +} + +define i32 @extract_bigelt.1(<4 x i64> %src) { +; LAZY-LABEL: define i32 @extract_bigelt.1( +; LAZY-SAME: <4 x i64> [[SRC:%.*]]) { +; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3 +; LAZY-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 32 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @extract_bigelt.1( +; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <4 x i64> [[SRC]] to <8 x i32> +; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = extractelement <8 x i32> [[SRC_CAST]], i32 7 +; AGGRESSIVE-NEXT: ret i32 [[SRC_EXTRACT]] +; + %src.0 = extractelement <4 x i64> %src, i64 3 + %shr = lshr i64 %src.0, 32 + %trunc = trunc i64 %shr to i32 + ret i32 %trunc +} + +;; Nothing happens because the shift amount is too small. 
+define i32 @extract_bigelt.2(<4 x i64> %src) { +; LAZY-LABEL: define i32 @extract_bigelt.2( +; LAZY-SAME: <4 x i64> [[SRC:%.*]]) { +; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3 +; LAZY-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 16 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @extract_bigelt.2( +; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3 +; AGGRESSIVE-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 16 +; AGGRESSIVE-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[TRUNC]] +; + %src.0 = extractelement <4 x i64> %src, i64 3 + %shr = lshr i64 %src.0, 16 + %trunc = trunc i64 %shr to i32 + ret i32 %trunc +} + +;; u0x0000ff00 = 65280 +;; u0x00ff0000 = 16711680 +;; u0xff0000ff = -16776961 +define i32 @shuffle_elts(<4 x i8> %vec) { +; LAZY-LABEL: define i32 @shuffle_elts( +; LAZY-SAME: <4 x i8> [[VEC:%.*]]) { +; LAZY-NEXT: [[VEC_0:%.*]] = extractelement <4 x i8> [[VEC]], i32 0 +; LAZY-NEXT: [[VEC_1:%.*]] = extractelement <4 x i8> [[VEC]], i32 1 +; LAZY-NEXT: [[VEC_2:%.*]] = extractelement <4 x i8> [[VEC]], i32 2 +; LAZY-NEXT: [[VEC_3:%.*]] = extractelement <4 x i8> [[VEC]], i32 3 +; LAZY-NEXT: [[ZEXT_0:%.*]] = zext i8 [[VEC_0]] to i32 +; LAZY-NEXT: [[ZEXT_1:%.*]] = zext i8 [[VEC_1]] to i32 +; LAZY-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16 +; LAZY-NEXT: [[OUT_1:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]] +; LAZY-NEXT: [[ZEXT_2:%.*]] = zext i8 [[VEC_2]] to i32 +; LAZY-NEXT: [[SHL_2:%.*]] = shl i32 [[ZEXT_2]], 8 +; LAZY-NEXT: [[ZEXT_3:%.*]] = zext i8 [[VEC_3]] to i32 +; LAZY-NEXT: [[SHL_3:%.*]] = shl i32 [[ZEXT_3]], 24 +; LAZY-NEXT: [[OUT_2:%.*]] = or i32 [[SHL_2]], [[SHL_3]] +; LAZY-NEXT: [[OUT:%.*]] = or i32 [[OUT_1]], [[OUT_2]] +; LAZY-NEXT: ret i32 [[OUT]] +; +; AGGRESSIVE-LABEL: define i32 @shuffle_elts( +; AGGRESSIVE-SAME: <4 x i8> [[VEC:%.*]]) { +; AGGRESSIVE-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x i8> [[VEC]] to i32 +; AGGRESSIVE-NEXT: [[VEC_MASK:%.*]] = and i32 [[VEC_CAST]], -16776961 +; AGGRESSIVE-NEXT: [[VEC_SHIFT:%.*]] = lshr i32 [[VEC_CAST]], 8 +; AGGRESSIVE-NEXT: [[VEC_MASK2:%.*]] = and i32 [[VEC_SHIFT]], 65280 +; AGGRESSIVE-NEXT: [[VEC_SHIFT4:%.*]] = shl i32 [[VEC_CAST]], 8 +; AGGRESSIVE-NEXT: [[VEC_MASK6:%.*]] = and i32 [[VEC_SHIFT4]], 16711680 +; AGGRESSIVE-NEXT: [[OUT_MERGE:%.*]] = or disjoint i32 [[VEC_MASK]], [[VEC_MASK2]] +; AGGRESSIVE-NEXT: [[OUT_MERGE8:%.*]] = or disjoint i32 [[OUT_MERGE]], [[VEC_MASK6]] +; AGGRESSIVE-NEXT: ret i32 [[OUT_MERGE8]] +; + %vec.0 = extractelement <4 x i8> %vec, i32 0 + %vec.1 = extractelement <4 x i8> %vec, i32 1 + %vec.2 = extractelement <4 x i8> %vec, i32 2 + %vec.3 = extractelement <4 x i8> %vec, i32 3 + + %zext.0 = zext i8 %vec.0 to i32 + + %zext.1 = zext i8 %vec.1 to i32 + %shl.1 = shl i32 %zext.1, 16 + %out.1 = or i32 %zext.0, %shl.1 + + %zext.2 = zext i8 %vec.2 to i32 + %shl.2 = shl i32 %zext.2, 8 + + %zext.3 = zext i8 %vec.3 to i32 + %shl.3 = shl i32 %zext.3, 24 + %out.2 = or i32 %shl.2, %shl.3 + + %out = or i32 %out.1, %out.2 + + ret i32 %out +} diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll new file mode 100644 index 0000000000000..5baefc7fb6cda --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll @@ -0,0 +1,294 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | 
FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +define <4 x i8> @obtain_v4i8(<2 x i16> %from) { +; LAZY-LABEL: define <4 x i8> @obtain_v4i8( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; LAZY-NEXT: ret <4 x i8> [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @obtain_v4i8( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_CAST]] +; + %from.0 = extractelement <2 x i16> %from, i64 0 + %mask.0 = trunc i16 %from.0 to i8 + %shr.1 = lshr i16 %from.0, 8 + %mask.1 = trunc i16 %shr.1 to i8 + + %from.1 = extractelement <2 x i16> %from, i64 1 + %mask.2 = trunc i16 %from.1 to i8 + %shr.3 = lshr i16 %from.1, 8 + %mask.3 = trunc i16 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> poison, i8 %mask.0, i64 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %mask.1, i64 1 + %build.2 = insertelement <4 x i8> %build.1, i8 %mask.2, i64 2 + %build.3 = insertelement <4 x i8> %build.2, i8 %mask.3, i64 3 + + ret <4 x i8> %build.3 +} + +define <2 x i16> @obtain_v2i16(<4 x i8> %from) { +; LAZY-LABEL: define <2 x i16> @obtain_v2i16( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; LAZY-NEXT: ret <2 x i16> [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @obtain_v2i16( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_CAST]] +; + %from.0 = extractelement <4 x i8> %from, i64 0 + %zext.0 = zext i8 %from.0 to i16 + + %from.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %from.1 to i16 + %shl.1 = shl i16 %zext.1, 8 + %out.1 = or i16 %zext.0, %shl.1 + + %from.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %from.2 to i16 + + %from.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %from.3 to i16 + %shl.3 = shl i16 %zext.3, 8 + %out.2 = or i16 %zext.2, %shl.3 + + %build.0 = insertelement <2 x i16> poison, i16 %out.1, i64 0 + %build.1 = insertelement <2 x i16> %build.0, i16 %out.2, i64 1 + + ret <2 x i16> %build.1 +} + +define <4 x i8> @obtain_v4i8_masked(<2 x i16> %from) { +; LAZY-LABEL: define <4 x i8> @obtain_v4i8_masked( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @obtain_v4i8_masked( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_SHUFFLE]] +; + %from.0 = extractelement <2 x i16> %from, i64 0 + %mask.0 = trunc i16 %from.0 to i8 + %shr.1 = lshr i16 %from.0, 8 + %mask.1 = trunc i16 %shr.1 to i8 + + %from.1 = extractelement <2 x i16> %from, i64 1 + %shr.3 = lshr i16 %from.1, 8 + %mask.3 = trunc i16 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> , i8 %mask.0, i64 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %mask.1, i64 1 + %build.3 = insertelement <4 x i8> %build.1, i8 %mask.3, i64 3 + + ret <4 x i8> %build.3 +} + 
+define <2 x i16> @obtain_v2i16_masked(<4 x i8> %from) { +; LAZY-LABEL: define <2 x i16> @obtain_v2i16_masked( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; LAZY-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], +; LAZY-NEXT: ret <2 x i16> [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @obtain_v2i16_masked( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], +; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_MASK]] +; + %from.0 = extractelement <4 x i8> %from, i64 0 + %zext.0 = zext i8 %from.0 to i16 + + %from.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %from.1 to i16 + %shl.1 = shl i16 %zext.1, 8 + %out.1 = or i16 %zext.0, %shl.1 + + %from.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %from.3 to i16 + %shl.3 = shl i16 %zext.3, 8 + + %build.0 = insertelement <2 x i16> poison, i16 %out.1, i64 0 + %build.1 = insertelement <2 x i16> %build.0, i16 %shl.3, i64 1 + + ret <2 x i16> %build.1 +} + +define <8 x i8> @obtain_v4i8_shifted(<2 x i16> %from) { +; LAZY-LABEL: define <8 x i8> @obtain_v4i8_shifted( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; LAZY-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @obtain_v4i8_shifted( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; + %from.0 = extractelement <2 x i16> %from, i64 0 + %mask.0 = trunc i16 %from.0 to i8 + %shr.1 = lshr i16 %from.0, 8 + %mask.1 = trunc i16 %shr.1 to i8 + + %from.1 = extractelement <2 x i16> %from, i64 1 + %mask.2 = trunc i16 %from.1 to i8 + %shr.3 = lshr i16 %from.1, 8 + %mask.3 = trunc i16 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> , i8 %mask.0, i64 4 + %build.1 = insertelement <8 x i8> %build.0, i8 %mask.1, i64 5 + %build.2 = insertelement <8 x i8> %build.1, i8 %mask.2, i64 6 + %build.3 = insertelement <8 x i8> %build.2, i8 %mask.3, i64 7 + + ret <8 x i8> %build.3 +} + +define <4 x i16> @obtain_v2i16_shifted(<4 x i8> %from) { +; LAZY-LABEL: define <4 x i16> @obtain_v2i16_shifted( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; LAZY-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <2 x i16> [[FROM_CAST]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> @obtain_v2i16_shifted( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <2 x i16> [[FROM_CAST]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> 
[[FROM_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; + %from.0 = extractelement <4 x i8> %from, i64 0 + %zext.0 = zext i8 %from.0 to i16 + + %from.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %from.1 to i16 + %shl.1 = shl i16 %zext.1, 8 + %out.1 = or i16 %zext.0, %shl.1 + + %from.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %from.2 to i16 + + %from.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %from.3 to i16 + %shl.3 = shl i16 %zext.3, 8 + %out.2 = or i16 %zext.2, %shl.3 + + %build.0 = insertelement <4 x i16> , i16 %out.1, i64 2 + %build.1 = insertelement <4 x i16> %build.0, i16 %out.2, i64 3 + + ret <4 x i16> %build.1 +} + +define <8 x i8> @combine_v4i8(<4 x i8> %bot, <2 x i16> %top) { +; LAZY-LABEL: define <8 x i8> @combine_v4i8( +; LAZY-SAME: <4 x i8> [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast <2 x i16> [[TOP]] to <4 x i8> +; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[TOP_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @combine_v4i8( +; AGGRESSIVE-SAME: <4 x i8> [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast <2 x i16> [[TOP]] to <4 x i8> +; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[TOP_SHUFFLE]] +; + %base = shufflevector <4 x i8> %bot, <4 x i8> poison, <8 x i32> + + %top.0 = extractelement <2 x i16> %top, i64 0 + %mask.0 = trunc i16 %top.0 to i8 + %shr.1 = lshr i16 %top.0, 8 + %mask.1 = trunc i16 %shr.1 to i8 + + %top.1 = extractelement <2 x i16> %top, i64 1 + %mask.2 = trunc i16 %top.1 to i8 + %shr.3 = lshr i16 %top.1, 8 + %mask.3 = trunc i16 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> %base, i8 %mask.0, i64 4 + %build.1 = insertelement <8 x i8> %build.0, i8 %mask.1, i64 5 + %build.2 = insertelement <8 x i8> %build.1, i8 %mask.2, i64 6 + %build.3 = insertelement <8 x i8> %build.2, i8 %mask.3, i64 7 + + ret <8 x i8> %build.3 +} + +define <4 x i16> @combine_v2i16(<2 x i16> %bot, <4 x i8> %top) { +; LAZY-LABEL: define <4 x i16> @combine_v2i16( +; LAZY-SAME: <2 x i16> [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast <4 x i8> [[TOP]] to <2 x i16> +; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[TOP_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> 
@combine_v2i16( +; AGGRESSIVE-SAME: <2 x i16> [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast <4 x i8> [[TOP]] to <2 x i16> +; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[TOP_SHUFFLE]] +; + %base = shufflevector <2 x i16> %bot, <2 x i16> poison, <4 x i32> + + %top.0 = extractelement <4 x i8> %top, i64 0 + %zext.0 = zext i8 %top.0 to i16 + + %top.1 = extractelement <4 x i8> %top, i64 1 + %zext.1 = zext i8 %top.1 to i16 + %shl.1 = shl i16 %zext.1, 8 + %out.1 = or i16 %zext.0, %shl.1 + + %top.2 = extractelement <4 x i8> %top, i64 2 + %zext.2 = zext i8 %top.2 to i16 + + %top.3 = extractelement <4 x i8> %top, i64 3 + %zext.3 = zext i8 %top.3 to i16 + %shl.3 = shl i16 %zext.3, 8 + %out.2 = or i16 %zext.2, %shl.3 + + %build.0 = insertelement <4 x i16> %base, i16 %out.1, i64 2 + %build.1 = insertelement <4 x i16> %build.0, i16 %out.2, i64 3 + + ret <4 x i16> %build.1 +} + +define <4 x i8> @shuffle_elts(<4 x i8> %vec) { +; LAZY-LABEL: define <4 x i8> @shuffle_elts( +; LAZY-SAME: <4 x i8> [[VEC:%.*]]) { +; LAZY-NEXT: [[SHUFFLED:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> poison, <4 x i32> +; LAZY-NEXT: ret <4 x i8> [[SHUFFLED]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @shuffle_elts( +; AGGRESSIVE-SAME: <4 x i8> [[VEC:%.*]]) { +; AGGRESSIVE-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[VEC_SHUFFLE2:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> [[VEC_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: [[VEC_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> [[VEC_SHUFFLE2]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[VEC_SHUFFLE4]] +; + %shuffled = shufflevector <4 x i8> %vec, <4 x i8> poison, <4 x i32> + ret <4 x i8> %shuffled +} From 932bc6bf48264f927a3e051d5273808ee891ba8d Mon Sep 17 00:00:00 2001 From: Zach Goldthorpe Date: Thu, 3 Jul 2025 18:42:57 -0500 Subject: [PATCH 2/2] Address a subset of the reviewer feedback. --- llvm/include/llvm/Transforms/Scalar.h | 4 +- .../Scalar/PackedIntegerCombinePass.h | 18 +- llvm/lib/Passes/PassBuilder.cpp | 29 ++ llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/lib/Passes/PassRegistry.def | 7 +- .../Scalar/PackedIntegerCombinePass.cpp | 392 ++++++++---------- .../PackedIntegerCombine/instructions.ll | 51 +-- .../PackedIntegerCombine/int2int.ll | 4 +- .../PackedIntegerCombine/int2vec.ll | 4 +- .../PackedIntegerCombine/vec2int.ll | 4 +- .../PackedIntegerCombine/vec2vec.ll | 4 +- 11 files changed, 228 insertions(+), 291 deletions(-) diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index ec9d89507c375..bd5b112ed6105 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -159,7 +159,9 @@ LLVM_ABI extern char &InferAddressSpacesID; // PackedIntegerCombinePass - Tracks individual bytes through instructions to // systematically identify redundant byte packing or unpacking operations. 
 //
-LLVM_ABI FunctionPass *createPackedIntegerCombinePass();
+LLVM_ABI FunctionPass *
+createPackedIntegerCombinePass(unsigned MaxCollectionIterations = 2,
+                               bool AggressiveRewriting = false);
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
index a5916e2e611cf..2ca56c06784dc 100644
--- a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
+++ b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
@@ -21,10 +21,26 @@
 
 namespace llvm {
 
+struct PackedIntegerCombineOptions {
+  /// Maximum number of iterations to isolate final packed instructions.
+  unsigned MaxCollectionIterations = 2;
+  /// Aggressively rewrite packed instructions.
+  bool AggressiveRewriting = false;
+};
+
 class PackedIntegerCombinePass
     : public PassInfoMixin<PackedIntegerCombinePass> {
+
+  PackedIntegerCombineOptions Options;
+
 public:
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  PackedIntegerCombinePass(PackedIntegerCombineOptions Options = {})
+      : Options(Options) {}
+
+  LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  LLVM_ABI void
+  printPipeline(raw_ostream &OS,
+                function_ref<StringRef(StringRef)> MapClassName2PassName);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 7a382ace34dbc..911a043fbae53 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1170,6 +1170,35 @@ Expected<MergedLoadStoreMotionOptions> parseMergedLoadStoreMotionOptions(StringRef Params) {
   return Result;
 }
 
+Expected<PackedIntegerCombineOptions>
+parsePackedIntegerCombineOptions(StringRef Params) {
+  PackedIntegerCombineOptions Options;
+  while (!Params.empty()) {
+    StringRef ParamName;
+    std::tie(ParamName, Params) = Params.split(';');
+
+    if (ParamName.consume_front("max-iterations=")) {
+      if (ParamName.getAsInteger(0, Options.MaxCollectionIterations))
+        return make_error<StringError>(
+            formatv("invalid max iteration count for PackedIntegerCombine "
+                    "pass: '{}'",
+                    ParamName)
+                .str(),
+            inconvertibleErrorCode());
+    } else if (ParamName == "aggressive") {
+      Options.AggressiveRewriting = true;
+    } else {
+      return make_error<StringError>(
+          formatv("invalid argument for PackedIntegerCombinePass: '{}'",
+                  ParamName)
+              .str(),
+          inconvertibleErrorCode());
+    }
+  }
+
+  return Options;
+}
+
 Expected<GVNOptions> parseGVNOptions(StringRef Params) {
   GVNOptions Result;
   while (!Params.empty()) {
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 2da72606bc47a..9b2254607d954 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -2361,4 +2361,4 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
 bool PassBuilder::isInstrumentedPGOUse() const {
   return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) ||
          !UseCtxProfile.empty();
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 6f1c405a5efa7..812eb5d3c15c4 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -476,7 +476,6 @@ FUNCTION_PASS("objc-arc", ObjCARCOptPass())
 FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass())
 FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass())
 FUNCTION_PASS("pa-eval", PAEvalPass())
-FUNCTION_PASS("packedintcombine", PackedIntegerCombinePass())
 FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
 FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
FUNCTION_PASS("place-safepoints", PlaceSafepointsPass()) @@ -628,6 +627,12 @@ FUNCTION_PASS_WITH_PARAMS( return MergedLoadStoreMotionPass(Opts); }, parseMergedLoadStoreMotionOptions, "no-split-footer-bb;split-footer-bb") +FUNCTION_PASS_WITH_PARAMS( + "packed-integer-combine", "PackedIntegerCombinePass", + [](PackedIntegerCombineOptions Options) { + return PackedIntegerCombinePass(Options); + }, + parsePackedIntegerCombineOptions, "max-iterations=N;aggressive") FUNCTION_PASS_WITH_PARAMS( "print", "LoopAccessInfoPrinterPass", [](bool AllowPartial) { diff --git a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp index 31edd28069a2b..fc3b57b71bf45 100644 --- a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp +++ b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// /// -/// This file provides the interface for LLVM's Packed Integer Combine pass. +/// This file implements the interface for LLVM's Packed Integer Combine pass. /// This pass tries to treat integers as packed chunks of individual bytes, /// and leverage this to coalesce needlessly fragmented /// computations. @@ -24,21 +24,11 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" +#include using namespace llvm; -#define DEBUG_TYPE "packedintcombine" - -static cl::opt MaxCollectionIterations( - "packedint-max-iterations", - cl::desc("Maximum number of iterations to isolate final packed " - "instructions. Set to 0 to iterate until convergence."), - cl::init(2), cl::Hidden); - -static cl::opt - AggressiveRewriting("packedint-aggressive-rewriter", - cl::desc("Aggressively rewrite packed instructions."), - cl::init(false), cl::Hidden); +#define DEBUG_TYPE "packed-integer-combine" namespace { @@ -101,10 +91,10 @@ class Byte { else Base->printAsOperand(ROS, false); - ROS << "[" << Integer << "]"; + ROS << '[' << Integer << ']'; if (NewLine) - ROS << "\n"; + ROS << '\n'; } LLVM_DUMP_METHOD void dump() const { print(errs(), true); } @@ -128,7 +118,7 @@ struct ByteLayout { }; /// Interpret the given type as a number of packed bytes, if possible. -static std::optional tryGetByteLayout(const Type *Ty) { +static std::optional getByteLayout(const Type *Ty) { unsigned IntBitWidth, NumElts; if (const auto *IntTy = dyn_cast(Ty)) { IntBitWidth = IntTy->getBitWidth(); @@ -148,13 +138,6 @@ static std::optional tryGetByteLayout(const Type *Ty) { return ByteLayout{IntBitWidth / Byte::BitWidth, NumElts}; } -/// Interpret the given type as a number of backed bytes (aborts if impossible). -static ByteLayout getByteLayout(const Type *Ty) { - const std::optional Layout = tryGetByteLayout(Ty); - assert(Layout); - return *Layout; -} - /// A convenience class for combining Byte instances obtained from the same base /// value, and with a common relative offset, which can hence be obtained /// simultaneously. @@ -167,7 +150,7 @@ struct CoalescedBytes { /// /// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide /// bytes 0, 1, 2 of the target %tgt, then ShrByteOffset = 3. - signed SignedShrByteOffset; + int SignedShrByteOffset; /// The bitmask identifying which bytes of the target value are covered by /// these coalesced bytes. /// @@ -176,12 +159,12 @@ struct CoalescedBytes { /// be set, corresponding to the first three bits of %tgt. 
SmallBitVector Mask; - explicit CoalescedBytes(Value &Base, signed Offset, SmallBitVector Mask) + explicit CoalescedBytes(Value &Base, int Offset, SmallBitVector Mask) : Base(&Base), SignedShrByteOffset(Offset), Mask(Mask) {} - explicit CoalescedBytes(Value &Base, signed Offset, unsigned NumBytes) + explicit CoalescedBytes(Value &Base, int Offset, unsigned NumBytes) : Base(&Base), SignedShrByteOffset(Offset), Mask(NumBytes) {} - bool alignsWith(Value *V, signed VOffset) const { + bool alignsWith(Value *V, int VOffset) const { return Base == V && SignedShrByteOffset == VOffset; } @@ -206,16 +189,16 @@ struct CoalescedBytes { for (unsigned Idx = 0; Idx < Mask.size(); ++Idx) { if (Mask.test(Idx)) { Base->printAsOperand(ROS, false); - ROS << "[" << (static_cast(Idx) + SignedShrByteOffset) << "]"; + ROS << '[' << (static_cast(Idx) + SignedShrByteOffset) << ']'; } else ROS << 0; ROS << "; "; } - ROS << "}"; + ROS << '}'; if (NewLine) - ROS << "\n"; + ROS << '\n'; } LLVM_DUMP_METHOD void dump() const { print(errs(), true); } @@ -254,67 +237,23 @@ using ByteVector = SmallVector; /// The decomposition of an IR value into its individual bytes, tracking where /// each byte is obtained. -class ByteDefinition { - /// Enum classifying what Ptr points to. - enum ByteType : uint8_t { - /// Ptr's value is undefined. - INVALID, - /// The byte definition is given by a ByteVector, which is referenced (but - /// not captured) by Ptr. - VECTOR, - /// The bytes are obtained from a (currently opaque) IR value, held by Ptr. - VALUE, - /// The bytes are obtained from a constant integer, held by Ptr. - CONST_INT, - /// The bytes are obtained from a constant vector of integers, held by Ptr. - CONST_VEC, - }; - - ByteType DefType; - void *Ptr; +struct ByteDefinition { + std::variant Ptr; ByteLayout Layout; - ByteDefinition(ByteType DefType, void *Ptr, ByteLayout Layout) - : DefType(DefType), Ptr(Ptr), Layout(Layout) {} public: /// Indicate that a value cannot be decomposed into bytes in a known way. - static ByteDefinition invalid() { return {INVALID, nullptr, {0, 0}}; } + static ByteDefinition invalid() { return {std::nullopt, {0, 0}}; } /// Indicate that a value's bytes are known, and track their producers. static ByteDefinition vector(ByteVector &Ref, ByteLayout Layout) { - return {VECTOR, &Ref, Layout}; + return {&Ref, Layout}; } /// Indicate that a value's bytes are opaque. static ByteDefinition value(Value &V) { - return {VALUE, &V, getByteLayout(V.getType())}; - } - /// Indicate that the bytes come from a constant integer. - static ByteDefinition constInt(ConstantInt &Int) { - return {CONST_INT, &Int, getByteLayout(Int.getType())}; - } - /// Indicate that the bytes come from a constant vector of integers. - static ByteDefinition constVec(Constant &Vec) { - assert(Vec.getType()->isVectorTy()); - return {CONST_VEC, &Vec, getByteLayout(Vec.getType())}; - } - - ByteVector &getVector() const { - assert(DefType == VECTOR); - return *static_cast(Ptr); - } - Value &getValue() const { - assert(DefType == VALUE); - return *static_cast(Ptr); - } - ConstantInt &getConstInt() const { - assert(DefType == CONST_INT); - return *static_cast(Ptr); - } - Constant &getConstVec() const { - assert(DefType == CONST_VEC); - return *static_cast(Ptr); + return {&V, *getByteLayout(V.getType())}; } - bool isValid() const { return DefType != INVALID; } + bool isValid() const { return !std::holds_alternative(Ptr); } /// Return true iff the byte definition is valid. 
operator bool() const { return isValid(); } @@ -322,62 +261,65 @@ class ByteDefinition { /// Get the definition of the byte at the specified byte offset, where 0 is /// the least significant byte. Byte getByte(unsigned Idx) const { - switch (DefType) { - default: - llvm_unreachable("Invalid byte definition"); - case VECTOR: - return getVector()[Idx].getByte(); - case VALUE: - return Byte(getValue(), Idx); - case CONST_INT: - return Byte(getConstInt().getValue().extractBitsAsZExtValue( - Byte::BitWidth, Idx * Byte::BitWidth)); - case CONST_VEC: { - const auto &Vec = getConstVec(); - const ByteLayout Layout = getByteLayout(Vec.getType()); - const unsigned VecIdx = Idx / Layout.NumBytesPerElement; - const unsigned EltIdx = Idx % Layout.NumBytesPerElement; - - Constant *Elt = Vec.getAggregateElement(VecIdx); - if (const auto *Int = dyn_cast(Elt)) - return Byte(Int->getValue().extractBitsAsZExtValue( - Byte::BitWidth, EltIdx * Byte::BitWidth)); - - return Byte(*Elt, EltIdx); - } - } + struct Visitor { + unsigned Idx; + + Byte operator()(std::nullopt_t) { + llvm_unreachable("Invalid byte definition"); + } + Byte operator()(ByteVector *BV) { return (*BV)[Idx].getByte(); } + Byte operator()(Value *V) { + if (auto *Int = dyn_cast(V)) + return Byte(Int->getValue().extractBitsAsZExtValue( + Byte::BitWidth, Idx * Byte::BitWidth)); + + if (V->getType()->isVectorTy()) { + if (auto *Vec = dyn_cast(V)) { + const ByteLayout Layout = *getByteLayout(Vec->getType()); + const unsigned VecIdx = Idx / Layout.NumBytesPerElement; + const unsigned EltIdx = Idx % Layout.NumBytesPerElement; + + if (Constant *Elt = Vec->getAggregateElement(VecIdx)) { + if (const auto *Int = dyn_cast(Elt)) + return Byte(Int->getValue().extractBitsAsZExtValue( + Byte::BitWidth, EltIdx * Byte::BitWidth)); + + return Byte(*Elt, EltIdx); + } + } + } + + return Byte(*V, Idx); + } + }; + + return std::visit(Visitor{Idx}, Ptr); } const ByteLayout &getLayout() const { return Layout; } void print(raw_ostream &ROS, bool NewLine = true) const { - switch (DefType) { - default: - ROS << "[INVALID]"; - break; - case VECTOR: { - ByteVector &BV = getVector(); - ROS << "{ "; - for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx) - ROS << ByteIdx << ": " << BV[ByteIdx].getByte() << "; "; - ROS << "}"; - break; - } - case VALUE: - ROS << "("; - getValue().printAsOperand(ROS); - ROS << ")[0:" << Layout.getNumBytes() << "]"; - break; - case CONST_INT: - ROS << getConstInt(); - break; - case CONST_VEC: - ROS << getConstVec(); - break; - } + struct Visitor { + raw_ostream &ROS; + const ByteLayout &Layout; + + void operator()(std::nullopt_t) { ROS << "[INVALID]"; } + void operator()(ByteVector *BV) { + ROS << "{ "; + for (unsigned ByteIdx = 0; ByteIdx < BV->size(); ++ByteIdx) + ROS << ByteIdx << ": " << (*BV)[ByteIdx].getByte() << "; "; + ROS << '}'; + } + void operator()(Value *V) { + ROS << '('; + V->printAsOperand(ROS); + ROS << ")[0:" << Layout.getNumBytes() << ']'; + } + }; + std::visit(Visitor{ROS, Layout}, Ptr); if (NewLine) - ROS << "\n"; + ROS << '\n'; } LLVM_DUMP_METHOD void dump() const { print(errs(), true); } @@ -415,7 +357,6 @@ class ByteExpander final : public InstVisitor { // Visitation implementations return `true` iff a new byte definition was // successfully constructed. 
- ByteVector visitAdd(BinaryOperator &I); ByteVector visitAnd(BinaryOperator &I); ByteVector visitOr(BinaryOperator &I); ByteVector visitXor(BinaryOperator &I); @@ -455,47 +396,18 @@ class ByteExpander final : public InstVisitor { /// Iterate over all instructions in a function over several passes to /// identify all final values and their byte definitions. - std::vector collectPIICandidates(Function &F); + std::vector + collectPIICandidates(Function &F, unsigned MaxCollectionIterations); }; -ByteVector ByteExpander::visitAdd(BinaryOperator &I) { - const ByteDefinition LhsDef = - getByteDefinitionIfIntermediateOperand(I.getOperand(0)); +ByteVector ByteExpander::visitAnd(BinaryOperator &I) { const ByteDefinition RhsDef = getByteDefinitionIfIntermediateOperand(I.getOperand(1)); - if (!LhsDef || !RhsDef) + if (!RhsDef) return {}; - - const ByteLayout &Layout = LhsDef.getLayout(); - const unsigned NumBytes = Layout.getNumBytes(); - - ByteVector BV; - BV.reserve(NumBytes); - - for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { - const Byte Lhs = LhsDef.getByte(ByteIdx); - const Byte Rhs = RhsDef.getByte(ByteIdx); - - const bool LhsIsZero = Lhs.isConstant() && Lhs.getConstant() == 0; - const bool RhsIsZero = Rhs.isConstant() && Rhs.getConstant() == 0; - if (LhsIsZero) - BV.emplace_back(Rhs, RhsIsZero ? ByteUse::AllOperands : 1); - else if (RhsIsZero) - BV.emplace_back(Lhs, 0); - else - return {}; - } - - assert(BV.size() == NumBytes); - return BV; -} - -ByteVector ByteExpander::visitAnd(BinaryOperator &I) { const ByteDefinition LhsDef = getByteDefinitionIfIntermediateOperand(I.getOperand(0)); - const ByteDefinition RhsDef = - getByteDefinitionIfIntermediateOperand(I.getOperand(1)); - if (!LhsDef || !RhsDef) + if (!LhsDef) return {}; const ByteLayout &Layout = LhsDef.getLayout(); @@ -546,11 +458,13 @@ ByteVector ByteExpander::visitAnd(BinaryOperator &I) { } ByteVector ByteExpander::visitOr(BinaryOperator &I) { - const ByteDefinition LhsDef = - getByteDefinitionIfIntermediateOperand(I.getOperand(0)); const ByteDefinition RhsDef = getByteDefinitionIfIntermediateOperand(I.getOperand(1)); - if (!LhsDef || !RhsDef) + if (!RhsDef) + return {}; + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!LhsDef) return {}; const ByteLayout &Layout = LhsDef.getLayout(); @@ -598,11 +512,13 @@ ByteVector ByteExpander::visitOr(BinaryOperator &I) { } ByteVector ByteExpander::visitXor(BinaryOperator &I) { - const ByteDefinition LhsDef = - getByteDefinitionIfIntermediateOperand(I.getOperand(0)); const ByteDefinition RhsDef = getByteDefinitionIfIntermediateOperand(I.getOperand(1)); - if (!LhsDef || !RhsDef) + if (!RhsDef) + return {}; + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!LhsDef) return {}; const ByteLayout &Layout = LhsDef.getLayout(); @@ -629,6 +545,10 @@ ByteVector ByteExpander::visitXor(BinaryOperator &I) { } ByteVector ByteExpander::visitShl(BinaryOperator &I) { + const auto *Const = dyn_cast(I.getOperand(1)); + if (!Const) + return {}; + const ByteDefinition BaseDef = getByteDefinitionIfIntermediateOperand(I.getOperand(0)); if (!BaseDef) @@ -636,10 +556,6 @@ ByteVector ByteExpander::visitShl(BinaryOperator &I) { const unsigned NumBytes = BaseDef.getLayout().getNumBytes(); - const auto *Const = dyn_cast(I.getOperand(1)); - if (!Const) - return {}; - if (isa(Const)) { const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue(); if (ShAmt % Byte::BitWidth != 0) @@ -683,6 +599,10 @@ ByteVector 
ByteExpander::visitShl(BinaryOperator &I) { } ByteVector ByteExpander::visitLShr(BinaryOperator &I) { + const auto *Const = dyn_cast(I.getOperand(1)); + if (!Const) + return {}; + const ByteDefinition BaseDef = getByteDefinitionIfIntermediateOperand(I.getOperand(0)); if (!BaseDef) @@ -690,10 +610,6 @@ ByteVector ByteExpander::visitLShr(BinaryOperator &I) { const unsigned NumBytes = BaseDef.getLayout().getNumBytes(); - const auto *Const = dyn_cast(I.getOperand(1)); - if (!Const) - return {}; - if (isa(Const)) { const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue(); if (ShAmt % Byte::BitWidth != 0) @@ -739,12 +655,12 @@ ByteVector ByteExpander::visitLShr(BinaryOperator &I) { } ByteVector ByteExpander::visitTruncInst(TruncInst &I) { - const std::optional Layout = tryGetByteLayout(I.getType()); + const std::optional Layout = getByteLayout(I.getType()); if (!Layout) return {}; const std::optional SrcLayout = - tryGetByteLayout(I.getOperand(0)->getType()); + getByteLayout(I.getOperand(0)->getType()); if (!SrcLayout) return {}; @@ -768,7 +684,7 @@ ByteVector ByteExpander::visitTruncInst(TruncInst &I) { } ByteVector ByteExpander::visitZExtInst(ZExtInst &I) { - const std::optional Layout = tryGetByteLayout(I.getType()); + const std::optional Layout = getByteLayout(I.getType()); if (!Layout) return {}; @@ -798,7 +714,7 @@ ByteVector ByteExpander::visitZExtInst(ZExtInst &I) { } ByteVector ByteExpander::visitBitCastInst(BitCastInst &I) { - const std::optional Layout = tryGetByteLayout(I.getType()); + const std::optional Layout = getByteLayout(I.getType()); if (!Layout) return {}; @@ -875,7 +791,7 @@ ByteVector ByteExpander::visitInsertElementInst(InsertElementInst &I) { } ByteVector ByteExpander::visitShuffleVectorInst(ShuffleVectorInst &I) { - const std::optional Layout = tryGetByteLayout(I.getType()); + const std::optional Layout = getByteLayout(I.getType()); if (!Layout) return {}; @@ -961,16 +877,10 @@ ByteVector *ByteExpander::expandByteDefinition(Value *V) { } ByteDefinition ByteExpander::getByteDefinition(Value *V, bool ExpandDef) { - const std::optional Layout = tryGetByteLayout(V->getType()); + const std::optional Layout = getByteLayout(V->getType()); if (!Layout) return ByteDefinition::invalid(); - if (auto *ConstInt = dyn_cast(V)) - return ByteDefinition::constInt(*ConstInt); - if (auto *Const = dyn_cast(V)) - if (Const->getType()->isVectorTy()) - return ByteDefinition::constVec(*Const); - if (ExpandDef) if (ByteVector *BV = expandByteDefinition(V)) return ByteDefinition::vector(*BV, *Layout); @@ -986,7 +896,7 @@ bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) { if (isa(V)) return true; - /// Short-circuit check. + // Short-circuit check. 
if (IsOperand && V->hasOneUse()) return true; @@ -997,13 +907,14 @@ bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) { return Definitions.contains(*FU.begin()); } -std::vector ByteExpander::collectPIICandidates(Function &F) { +std::vector +ByteExpander::collectPIICandidates(Function &F, + unsigned MaxCollectionIterations) { std::vector PackedIntInsts; - LLVM_DEBUG(dbgs() << "PICP: Entering function " << F.getName() << "\n"); unsigned NumIterations = 1; for (;;) { - LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << "\n"); + LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << '\n'); bool Converged = true; std::vector CollectedInsts; @@ -1034,7 +945,7 @@ std::vector ByteExpander::collectPIICandidates(Function &F) { LLVM_DEBUG({ dbgs() << "PICP: Updating definition: "; I.printAsOperand(dbgs()); - dbgs() << " = " << getByteDefinition(&I) << "\n"; + dbgs() << " = " << getByteDefinition(&I) << '\n'; }); } } @@ -1056,7 +967,7 @@ std::vector ByteExpander::collectPIICandidates(Function &F) { ++NumIterations; } - LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << "\n"); + LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << '\n'); return PackedIntInsts; } @@ -1203,7 +1114,7 @@ class BytePackFolder { LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Queuing cast " << *CI << "\n"; + dbgs() << "]: Queuing cast " << *CI << '\n'; }); return CI; } @@ -1216,7 +1127,7 @@ class BytePackFolder { LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Queuing inst " << *I << "\n"; + dbgs() << "]: Queuing inst " << *I << '\n'; }); return I; } @@ -1346,7 +1257,7 @@ class BytePackFolder { const unsigned NumTargetBytes = Layout.getNumBytes(); Value *V = CB.Base; - const unsigned NumSrcBytes = getByteLayout(V->getType()).getNumBytes(); + const unsigned NumSrcBytes = getByteLayout(V->getType())->getNumBytes(); const StringRef &Name = V->getName(); // Transformation: shr -> trunc -> mask -> zext -> shl @@ -1418,7 +1329,7 @@ class BytePackFolder { const unsigned NumTargetBytes = Layout.getNumBytes(); Value *V = CB.Base; const StringRef &Name = V->getName(); - ByteLayout VecLayout = getByteLayout(V->getType()); + ByteLayout VecLayout = *getByteLayout(V->getType()); // For sub-element accesses, try to subdivide the vector into smaller // elements. @@ -1431,7 +1342,7 @@ class BytePackFolder { auto *NewTy = FixedVectorType::get(TargetIntTy, VecLayout.NumVecElements * SplitFactor); V = pushCast(Instruction::BitCast, V, NewTy); - VecLayout = getByteLayout(V->getType()); + VecLayout = *getByteLayout(V->getType()); } // Give up if bytes are obtained from a strange offset. 
@@ -1526,7 +1437,7 @@ class BytePackFolder { auto *TargetVecTy = cast(TargetInst->getType()); Type *TargetEltTy = TargetVecTy->getElementType(); - const ByteLayout SrcLayout = getByteLayout(CB.Base->getType()); + const ByteLayout SrcLayout = *getByteLayout(CB.Base->getType()); Value *V = CB.Base; const StringRef &Name = V->getName(); @@ -1602,7 +1513,7 @@ class BytePackFolder { public: BytePackFolder(Instruction *TargetV) - : TargetInst(TargetV), Layout(getByteLayout(TargetV->getType())), + : TargetInst(TargetV), Layout(*getByteLayout(TargetV->getType())), VectorAlignedPack(PartialBytePack::invalid()) {} ~BytePackFolder() { @@ -1612,7 +1523,7 @@ class BytePackFolder { LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Dequeuing cast " << *I << "\n"; + dbgs() << "]: Dequeuing cast " << *I << '\n'; }); I->replaceAllUsesWith(PoisonValue::get(I->getType())); I->deleteValue(); @@ -1622,7 +1533,7 @@ class BytePackFolder { LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Dequeuing inst " << *Insts.back() << "\n"; + dbgs() << "]: Dequeuing inst " << *Insts.back() << '\n'; }); Insts.back()->deleteValue(); Insts.pop_back(); @@ -1632,15 +1543,17 @@ class BytePackFolder { /// Try to generate instructions for coalescing the given bytes and aligning /// them to the target value. Returns true iff this is successful. bool pushCoalescedBytes(CoalescedBytes CB) { - if (isa(CB.Base) && CB.SignedShrByteOffset == 0) { - WorkList.emplace_back(CB.Base, CB.Mask); - return true; - } + if (CB.SignedShrByteOffset == 0) + if (auto *Const = dyn_cast(CB.Base)) { + WorkList.emplace_back( + ConstantExpr::getBitCast(Const, TargetInst->getType()), CB.Mask); + return true; + } LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Preparing bytes " << CB << "\n"; + dbgs() << "]: Preparing bytes " << CB << '\n'; }); if (isa(TargetInst->getType())) { if (isa(CB.Base->getType())) @@ -1735,8 +1648,9 @@ struct PackedIntInstruction { /// If the rewriter is non-aggressive, return nullopt if the rewriting is /// determined to be unnecessary. static std::optional> -getCoalescingOpportunity(Type *Ty, const ByteVector &BV) { - const ByteLayout Layout = getByteLayout(Ty); +getCoalescingOpportunity(Type *Ty, const ByteVector &BV, + bool AggressiveRewriting) { + const ByteLayout Layout = *getByteLayout(Ty); assert(Layout.getNumBytes() == BV.size() && "Byte definition has unexpected width."); @@ -1761,8 +1675,8 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) { } else { CoalescedBytes *CB = nullptr; Value *Base = B.getBase(); - const signed Offset = - static_cast(B.getIndex()) - static_cast(ByteIdx); + const int Offset = + static_cast(B.getIndex()) - static_cast(ByteIdx); for (unsigned CBIdx = 0; CBIdx < CBV.size(); ++CBIdx) { if (CBV[CBIdx].alignsWith(Base, Offset)) { CB = &CBV[CBIdx]; @@ -1773,7 +1687,7 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) { LLVM_DEBUG(dbgs() << "PICP: Bytes " << *CB << " from operand " << OpIdx << " can be coalesced with byte " << B - << " from operand " << BU.getOperandIndex() << "\n"); + << " from operand " << BU.getOperandIndex() << '\n'); OperandsAlreadyCoalesced = false; } } @@ -1831,7 +1745,8 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) { /// Queue into \p PIIV the set of final values (or operands thereof, if the /// rewriter is non-aggressive) which are deemed beneficial to rewrite. 
 static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
-                           Instruction &FinalInst, ByteExpander &BE) {
+                           Instruction &FinalInst, ByteExpander &BE,
+                           bool AggressiveRewriting) {
   SmallVector<Instruction *> WorkList{&FinalInst};
   SmallPtrSet Seen{&FinalInst};
 
@@ -1844,15 +1759,15 @@ static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
       // This instruction is beyond the analysis scope of PICP.
       continue;
 
-    LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << "\n"
+    LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << '\n'
                       << " byte pack: " << BE.getByteDefinition(I)
-                      << "\n");
+                      << '\n');
 
     auto CBV = [&]() -> std::optional<SmallVector<CoalescedBytes>> {
      // Short-circuit check for casts.
      if (!AggressiveRewriting && I->getNumOperands() == 1)
        return std::nullopt;
-      return getCoalescingOpportunity(I->getType(), *BV);
+      return getCoalescingOpportunity(I->getType(), *BV, AggressiveRewriting);
     }();
 
     if (!CBV) {
@@ -1868,19 +1783,20 @@ static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
   } while (!WorkList.empty());
 }
 
-static bool runImpl(Function &F) {
+static bool runImpl(Function &F, PackedIntegerCombineOptions Options) {
   ByteExpander BE;
 
-  std::vector<Instruction *> PIICandidates = BE.collectPIICandidates(F);
+  std::vector<Instruction *> PIICandidates =
+      BE.collectPIICandidates(F, Options.MaxCollectionIterations);
 
   std::vector<PackedIntInstruction> PIIV;
   for (Instruction *I : PIICandidates) {
     if (!BE.checkIfIntermediate(I))
-      queueRewriting(PIIV, *I, BE);
+      queueRewriting(PIIV, *I, BE, Options.AggressiveRewriting);
     else
-      LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << "\n"
+      LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << '\n'
                         << " final user: "
-                        << **BE.getFinalUsers(I).begin() << "\n");
+                        << **BE.getFinalUsers(I).begin() << '\n');
   }
 
   DenseMap<Instruction *, Value *> InstSubs;
@@ -1888,7 +1804,7 @@ static bool runImpl(Function &F) {
   for (const PackedIntInstruction &PII : PIIV)
     if (Value *V = PII.rewrite(IRB)) {
       LLVM_DEBUG(dbgs() << "PICP rewrite successful for " << *PII.TargetInst
-                        << "\n");
+                        << '\n');
       InstSubs[PII.TargetInst] = V;
     }
 
@@ -1907,12 +1823,15 @@ static bool runImpl(Function &F) {
 }
 
 class PackedIntegerCombineLegacyPass : public FunctionPass {
+  PackedIntegerCombineOptions Options;
+
 public:
   static char ID;
 
-  PackedIntegerCombineLegacyPass() : FunctionPass(ID) {}
+  PackedIntegerCombineLegacyPass(PackedIntegerCombineOptions Options)
+      : FunctionPass(ID), Options(Options) {}
 
-  bool runOnFunction(Function &F) override { return runImpl(F); }
+  bool runOnFunction(Function &F) override { return runImpl(F, Options); }
 };
 
 char PackedIntegerCombineLegacyPass::ID = 0;
@@ -1920,7 +1839,7 @@ char PackedIntegerCombineLegacyPass::ID = 0;
 
 PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
-  if (!runImpl(F))
+  if (!runImpl(F, Options))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
@@ -1928,9 +1847,22 @@ PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
   return PA;
 }
 
+void PackedIntegerCombinePass::printPipeline(
+    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+  static_cast<PassInfoMixin<PackedIntegerCombinePass> *>(this)->printPipeline(
+      OS, MapClassName2PassName);
+  OS << '<';
+  if (Options.AggressiveRewriting)
+    OS << "aggressive;";
+  OS << "max-iterations=" << Options.MaxCollectionIterations << '>';
+}
+
 INITIALIZE_PASS(PackedIntegerCombineLegacyPass, DEBUG_TYPE,
                 "Packed Integer Combine", false, false)
 
-FunctionPass *llvm::createPackedIntegerCombinePass() {
-  return new PackedIntegerCombineLegacyPass();
+FunctionPass *
+llvm::createPackedIntegerCombinePass(unsigned MaxCollectionIterations,
+                                     bool AggressiveRewriting) {
+  return new PackedIntegerCombineLegacyPass(
+      {MaxCollectionIterations, AggressiveRewriting});
 }
diff --git a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
index e7b6a6bc66fa1..c3804eeff6891 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
@@ -1,53 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
-
-;; u0xff00ff00 = -16711936
-;; u0x00ff00ff = 16711935
-define i32 @add.0(i32 %a, i32 %b) {
-; LAZY-LABEL: define i32 @add.0(
-; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; LAZY-NEXT:    [[A_MASK:%.*]] = and i32 [[A]], -16711936
-; LAZY-NEXT:    [[B_MASK:%.*]] = and i32 [[B]], 16711935
-; LAZY-NEXT:    [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]]
-; LAZY-NEXT:    ret i32 [[ADD]]
-;
-; AGGRESSIVE-LABEL: define i32 @add.0(
-; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; AGGRESSIVE-NEXT:    [[B_MASK2:%.*]] = and i32 [[B]], 16711935
-; AGGRESSIVE-NEXT:    [[A_MASK4:%.*]] = and i32 [[A]], -16711936
-; AGGRESSIVE-NEXT:    [[ADD_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]]
-; AGGRESSIVE-NEXT:    ret i32 [[ADD_MERGE]]
-;
-  %a.mask = and i32 %a, u0xff00ff00
-  %b.mask = and i32 %b, u0x00ff00ff
-  %add = add i32 %a.mask, %b.mask
-  ret i32 %add
-}
-
-;; u0xff00ffff = -16711681
-;; u0x00ff00ff = 16711935
-;; Nothing happens in this case because of the overlapping bytes.
-define i32 @add.1(i32 %a, i32 %b) {
-; LAZY-LABEL: define i32 @add.1(
-; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; LAZY-NEXT:    [[A_MASK:%.*]] = and i32 [[A]], -16711681
-; LAZY-NEXT:    [[B_MASK:%.*]] = and i32 [[B]], 16711935
-; LAZY-NEXT:    [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]]
-; LAZY-NEXT:    ret i32 [[ADD]]
-;
-; AGGRESSIVE-LABEL: define i32 @add.1(
-; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; AGGRESSIVE-NEXT:    [[A_MASK2:%.*]] = and i32 [[A]], -16711681
-; AGGRESSIVE-NEXT:    [[B_MASK4:%.*]] = and i32 [[B]], 16711935
-; AGGRESSIVE-NEXT:    [[ADD:%.*]] = add i32 [[A_MASK2]], [[B_MASK4]]
-; AGGRESSIVE-NEXT:    ret i32 [[ADD]]
-;
-  %a.mask = and i32 %a, u0xff00ffff
-  %b.mask = and i32 %b, u0x00ff00ff
-  %add = add i32 %a.mask, %b.mask
-  ret i32 %add
-}
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define i32 @and.0(i32 %a, i32 %b) {
 ; LAZY-LABEL: define i32 @and.0(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
index fe08ce93719d0..274d01f4ad70f 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define i16 @top_bytes(i32 %a, i32 %b) {
 ; LAZY-LABEL: define i16 @top_bytes(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
index e4c1538826e0f..ad4424a84a6cc 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define <2 x i8> @top_bytes(i32 %a, i32 %b) {
 ; LAZY-LABEL: define <2 x i8> @top_bytes(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
index 22ee7fcf5b4c6..624f2b4f8acac 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define i32 @extract_i32(<4 x i8> %from) {
 ; LAZY-LABEL: define i32 @extract_i32(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
index 5baefc7fb6cda..b6dbaa546c718 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define <4 x i8> @obtain_v4i8(<2 x i16> %from) {
 ; LAZY-LABEL: define <4 x i8> @obtain_v4i8(