From 652e7a5fb486c992c90bf0bdfcadfb12fb41b72c Mon Sep 17 00:00:00 2001 From: Zach Goldthorpe Date: Mon, 30 Jun 2025 09:24:53 -0500 Subject: [PATCH 1/2] Squashing wip branch. --- llvm/include/llvm/InitializePasses.h | 1 + llvm/include/llvm/Transforms/Scalar.h | 7 + .../Scalar/PackedIntegerCombinePass.h | 32 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassBuilderPipelines.cpp | 7 + llvm/lib/Passes/PassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11 +- llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 + .../Scalar/PackedIntegerCombinePass.cpp | 1936 +++++++++++++++++ llvm/lib/Transforms/Scalar/Scalar.cpp | 1 + .../CodeGen/AMDGPU/combine-vload-extract.ll | 6 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 3 + llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 105 +- .../AMDGPU/splitkit-getsubrangeformask.ll | 168 +- llvm/test/Other/new-pm-defaults.ll | 1 + .../Other/new-pm-thinlto-postlink-defaults.ll | 1 + .../new-pm-thinlto-postlink-pgo-defaults.ll | 1 + ...-pm-thinlto-postlink-samplepgo-defaults.ll | 1 + .../Other/new-pm-thinlto-prelink-defaults.ll | 1 + .../new-pm-thinlto-prelink-pgo-defaults.ll | 1 + ...w-pm-thinlto-prelink-samplepgo-defaults.ll | 1 + .../PackedIntegerCombine/instructions.ll | 601 +++++ .../PackedIntegerCombine/int2int.ll | 302 +++ .../PackedIntegerCombine/int2vec.ll | 393 ++++ .../PackedIntegerCombine/vec2int.ll | 480 ++++ .../PackedIntegerCombine/vec2vec.ll | 294 +++ 26 files changed, 4173 insertions(+), 184 deletions(-) create mode 100644 llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h create mode 100644 llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp create mode 100644 llvm/test/Transforms/PackedIntegerCombine/instructions.ll create mode 100644 llvm/test/Transforms/PackedIntegerCombine/int2int.ll create mode 100644 llvm/test/Transforms/PackedIntegerCombine/int2vec.ll create mode 100644 llvm/test/Transforms/PackedIntegerCombine/vec2int.ll create mode 100644 llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 1c4ed3843b390..7e934e635c063 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -237,6 +237,7 @@ initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &); LLVM_ABI void initializeOptimizePHIsLegacyPass(PassRegistry &); LLVM_ABI void initializePEILegacyPass(PassRegistry &); LLVM_ABI void initializePHIEliminationPass(PassRegistry &); +LLVM_ABI void initializePackedIntegerCombineLegacyPassPass(PassRegistry &); LLVM_ABI void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &); LLVM_ABI void initializePatchableFunctionLegacyPass(PassRegistry &); LLVM_ABI void initializePeepholeOptimizerLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index 1398f171b0f78..ec9d89507c375 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -154,6 +154,13 @@ LLVM_ABI FunctionPass * createInferAddressSpacesPass(unsigned AddressSpace = ~0u); LLVM_ABI extern char &InferAddressSpacesID; +//===----------------------------------------------------------------------===// +// +// PackedIntegerCombinePass - Tracks individual bytes through instructions to +// systematically identify redundant byte packing or unpacking operations. 
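+// For example (illustrative only, not taken from the tests in this patch),
+// a reassembly of bytes that were never really taken apart, such as
+//   %lo = and i32 %x, 65535
+//   %hi = and i32 %x, -65536
+//   %r  = or i32 %lo, %hi
+// can be recognised as producing exactly the bytes of %x.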
+// +LLVM_ABI FunctionPass *createPackedIntegerCombinePass(); + //===----------------------------------------------------------------------===// // // PartiallyInlineLibCalls - Tries to inline the fast path of library diff --git a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h new file mode 100644 index 0000000000000..a5916e2e611cf --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h @@ -0,0 +1,32 @@ +//===- PackedIntegerCombinePass.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file provides the interface for LLVM's Packed Integer Combine pass. +/// This pass tries to treat integers as packed chunks of individual bytes, +/// and leverage this to coalesce needlessly fragmented +/// computations. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H +#define LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class PackedIntegerCombinePass + : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_PACKEDINTCOMBINE_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 0697a0a6b4c74..7a382ace34dbc 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -313,6 +313,7 @@ #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/NewGVN.h" +#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h" #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" #include "llvm/Transforms/Scalar/PlaceSafepoints.h" #include "llvm/Transforms/Scalar/Reassociate.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index c83d2dc1f1514..2da72606bc47a 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -121,6 +121,7 @@ #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/NewGVN.h" +#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h" #include "llvm/Transforms/Scalar/Reassociate.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Scalar/SROA.h" @@ -542,6 +543,9 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, // opportunities that creates). FPM.addPass(BDCEPass()); + // Simplify bit-packed operations before cleaning up with instcombine. + FPM.addPass(PackedIntegerCombinePass()); + // Run instcombine after redundancy and dead bit elimination to exploit // opportunities opened up by them. FPM.addPass(InstCombinePass()); @@ -743,6 +747,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // opportunities that creates). FPM.addPass(BDCEPass()); + // Simplify bit-packed operations before cleaning up with instcombine. 
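+  // The pass is also registered as "packedintcombine", so the simplification
+  // can be exercised in isolation with, e.g.,
+  //   opt -passes=packedintcombine -S < input.ll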
+ FPM.addPass(PackedIntegerCombinePass()); + // Run instcombine after redundancy and dead bit elimination to exploit // opportunities opened up by them. FPM.addPass(InstCombinePass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 65276489e6f02..6f1c405a5efa7 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -476,6 +476,7 @@ FUNCTION_PASS("objc-arc", ObjCARCOptPass()) FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass()) FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass()) FUNCTION_PASS("pa-eval", PAEvalPass()) +FUNCTION_PASS("packedintcombine", PackedIntegerCombinePass()) FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass()) FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt()) FUNCTION_PASS("place-safepoints", PlaceSafepointsPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c3536113e9bef..8f0ef348fe778 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -104,6 +104,7 @@ #include "llvm/Transforms/Scalar/InferAddressSpaces.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" +#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h" #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" #include "llvm/Transforms/Scalar/Sink.h" #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" @@ -1378,8 +1379,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { TargetPassConfig::addCodeGenPrepare(); - if (isPassEnabled(EnableLoadStoreVectorizer)) + if (isPassEnabled(EnableLoadStoreVectorizer)) { addPass(createLoadStoreVectorizerPass()); + // LSV pass opens up more opportunities for packed integer combining. + addPass(createPackedIntegerCombinePass()); + } // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it @@ -2101,8 +2105,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { Base::addCodeGenPrepare(addPass); - if (isPassEnabled(EnableLoadStoreVectorizer)) + if (isPassEnabled(EnableLoadStoreVectorizer)) { addPass(LoadStoreVectorizerPass()); + // LSV pass opens up more opportunities for packed integer combining. + addPass(PackedIntegerCombinePass()); + } // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 84a5b02043d01..d45a3785f9f8f 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -61,6 +61,7 @@ add_llvm_component_library(LLVMScalarOpts NaryReassociate.cpp NewGVN.cpp PartiallyInlineLibCalls.cpp + PackedIntegerCombinePass.cpp PlaceSafepoints.cpp Reassociate.cpp Reg2Mem.cpp diff --git a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp new file mode 100644 index 0000000000000..31edd28069a2b --- /dev/null +++ b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp @@ -0,0 +1,1936 @@ +//===- PackedIntegerCombinePass.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This file provides the interface for LLVM's Packed Integer Combine pass. +/// This pass tries to treat integers as packed chunks of individual bytes, +/// and leverage this to coalesce needlessly fragmented +/// computations. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/PackedIntegerCombinePass.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +#define DEBUG_TYPE "packedintcombine" + +static cl::opt MaxCollectionIterations( + "packedint-max-iterations", + cl::desc("Maximum number of iterations to isolate final packed " + "instructions. Set to 0 to iterate until convergence."), + cl::init(2), cl::Hidden); + +static cl::opt + AggressiveRewriting("packedint-aggressive-rewriter", + cl::desc("Aggressively rewrite packed instructions."), + cl::init(false), cl::Hidden); + +namespace { + +/// Reference to either a constant byte, or a byte extracted from an IR value. +class Byte { + /// The base value from which the byte is obtained. + Value *Base; + + /// If the base value is not null, then this holds the index of the byte + /// being used, where 0 is the least significant byte. + /// Otherwise, this is treated as a constant byte. + unsigned Integer; + +public: + static constexpr unsigned BitWidth = 8; + static constexpr unsigned AllOnes = 0xff; + + /// Construct a byte from a well-defined IR value. + explicit Byte(Value &Base, unsigned Index) : Base(&Base), Integer(Index) {} + + /// Construct a constant byte. + explicit Byte(unsigned Constant) : Base(nullptr), Integer(Constant) { + assert(Constant <= AllOnes && "Constant is too large to fit in a byte."); + } + + /// Construct a constant byte that is fully set. + static Byte ones() { return Byte(Byte::AllOnes); } + /// Construct the zero byte. + static Byte zeroes() { return Byte(0); } + + /// Indicate whether the byte is a known integer constant. + /// Note that poison or undef base values are not recognised as constant. + bool isConstant() const { return !Base; } + + /// Get the constant byte value. + unsigned getConstant() const { + assert(isConstant() && "Expected a constant byte."); + return Integer; + } + + /// Get the base IR value from which this byte is obtained. + Value *getBase() const { + assert(!isConstant() && "Byte constants do not have a base value."); + return Base; + } + + /// Get the byte offset of the IR value referenced by the byte. 
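+  /// For example (illustrative), for an i32 value V, Byte(*V, 2) refers to
+  /// bits [16, 24) of V and its getIndex() returns 2:
+  /// \code
+  ///   Byte B(*V, 2);               // third least-significant byte of V
+  ///   unsigned Idx = B.getIndex(); // == 2
+  /// \endcode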
+ unsigned getIndex() const { + assert(!isConstant() && "Byte constants are not indexed."); + return Integer; + } + + bool operator==(const Byte &Other) const { + return Base == Other.Base && Integer == Other.Integer; + } + + void print(raw_ostream &ROS, bool NewLine = true) const { + if (isConstant()) + ROS << "const"; + else + Base->printAsOperand(ROS, false); + + ROS << "[" << Integer << "]"; + + if (NewLine) + ROS << "\n"; + } + + LLVM_DUMP_METHOD void dump() const { print(errs(), true); } +}; + +inline raw_ostream &operator<<(raw_ostream &ROS, const Byte &B) { + B.print(ROS, false); + return ROS; +} + +/// Convenience data structure for describing the layout of bytes for vector and +/// integer types, treating integer types as singleton vectors. +struct ByteLayout { + /// The number of bytes that fit in a single element. + unsigned NumBytesPerElement; + /// The number of vector elements (or 1, if the type is an integer type). + unsigned NumVecElements; + + /// Get the total number of bytes held by the vector or integer type. + unsigned getNumBytes() const { return NumBytesPerElement * NumVecElements; } +}; + +/// Interpret the given type as a number of packed bytes, if possible. +static std::optional tryGetByteLayout(const Type *Ty) { + unsigned IntBitWidth, NumElts; + if (const auto *IntTy = dyn_cast(Ty)) { + IntBitWidth = IntTy->getBitWidth(); + NumElts = 1; + } else if (const auto *VecTy = dyn_cast(Ty)) { + const auto *IntTy = dyn_cast(VecTy->getElementType()); + if (!IntTy) + return std::nullopt; + IntBitWidth = IntTy->getBitWidth(); + NumElts = VecTy->getNumElements(); + } else + return std::nullopt; + + if (IntBitWidth % Byte::BitWidth != 0) + return std::nullopt; + + return ByteLayout{IntBitWidth / Byte::BitWidth, NumElts}; +} + +/// Interpret the given type as a number of backed bytes (aborts if impossible). +static ByteLayout getByteLayout(const Type *Ty) { + const std::optional Layout = tryGetByteLayout(Ty); + assert(Layout); + return *Layout; +} + +/// A convenience class for combining Byte instances obtained from the same base +/// value, and with a common relative offset, which can hence be obtained +/// simultaneously. +struct CoalescedBytes { + /// The value from which the coalesced bytes are all derived. This pointer is + /// never null. + Value *Base; + /// The number of bytes to shift right to align the coalesced bytes with the + /// target value. + /// + /// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide + /// bytes 0, 1, 2 of the target %tgt, then ShrByteOffset = 3. + signed SignedShrByteOffset; + /// The bitmask identifying which bytes of the target value are covered by + /// these coalesced bytes. + /// + /// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide + /// bytes 0, 1, 2 of the target %tgt, then this mask's first three bits will + /// be set, corresponding to the first three bits of %tgt. + SmallBitVector Mask; + + explicit CoalescedBytes(Value &Base, signed Offset, SmallBitVector Mask) + : Base(&Base), SignedShrByteOffset(Offset), Mask(Mask) {} + explicit CoalescedBytes(Value &Base, signed Offset, unsigned NumBytes) + : Base(&Base), SignedShrByteOffset(Offset), Mask(NumBytes) {} + + bool alignsWith(Value *V, signed VOffset) const { + return Base == V && SignedShrByteOffset == VOffset; + } + + /// Get the number of bytes to shift the base value right to align with the + /// target value. 
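+  /// For example, SignedShrByteOffset == 3 gives getShrBytes() == 3 and
+  /// getShlBytes() == 0, whereas SignedShrByteOffset == -2 gives
+  /// getShrBytes() == 0 and getShlBytes() == 2.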
+ unsigned getShrBytes() const { return std::max(0, SignedShrByteOffset); } + + /// Get the number of bytes to shift the base value left to align with the + /// target value. + unsigned getShlBytes() const { return std::max(0, -SignedShrByteOffset); } + + /// Get the number of bits to shift the base value right to align with the + /// target value. + unsigned getShrBits() const { return getShrBytes() * Byte::BitWidth; } + + /// Get the number of bits to shift the base value left to align with the + /// target value. + unsigned getShlBits() const { return getShlBytes() * Byte::BitWidth; } + + void print(raw_ostream &ROS, bool NewLine = true) const { + ROS << "{ "; + for (unsigned Idx = 0; Idx < Mask.size(); ++Idx) { + if (Mask.test(Idx)) { + Base->printAsOperand(ROS, false); + ROS << "[" << (static_cast(Idx) + SignedShrByteOffset) << "]"; + } else + ROS << 0; + + ROS << "; "; + } + ROS << "}"; + + if (NewLine) + ROS << "\n"; + } + + LLVM_DUMP_METHOD void dump() const { print(errs(), true); } +}; + +inline raw_ostream &operator<<(raw_ostream &ROS, const CoalescedBytes &CB) { + CB.print(ROS, false); + return ROS; +} + +/// Association of a Byte (constant or byte extracted from an LLVM Value) to the +/// operand(s) responsible for producing it. A value of ByteUse::AllOperands +/// (-1) indicates that all operands are responsible for producing the given +/// byte. +class ByteUse { + + Byte B; + int OpIdx; + +public: + /// Sentinel value representing that all operands are responsible for the + /// given Byte. + static constexpr int AllOperands = -1; + + ByteUse(Byte B, int OpIdx) : B(B), OpIdx(OpIdx) {} + + const Byte &getByte() const { return B; } + int getOperandIndex() const { return OpIdx; } + + bool operator==(const ByteUse &BU) const { + return BU.B == B && BU.OpIdx == OpIdx; + } +}; + +using ByteVector = SmallVector; + +/// The decomposition of an IR value into its individual bytes, tracking where +/// each byte is obtained. +class ByteDefinition { + /// Enum classifying what Ptr points to. + enum ByteType : uint8_t { + /// Ptr's value is undefined. + INVALID, + /// The byte definition is given by a ByteVector, which is referenced (but + /// not captured) by Ptr. + VECTOR, + /// The bytes are obtained from a (currently opaque) IR value, held by Ptr. + VALUE, + /// The bytes are obtained from a constant integer, held by Ptr. + CONST_INT, + /// The bytes are obtained from a constant vector of integers, held by Ptr. + CONST_VEC, + }; + + ByteType DefType; + void *Ptr; + ByteLayout Layout; + ByteDefinition(ByteType DefType, void *Ptr, ByteLayout Layout) + : DefType(DefType), Ptr(Ptr), Layout(Layout) {} + +public: + /// Indicate that a value cannot be decomposed into bytes in a known way. + static ByteDefinition invalid() { return {INVALID, nullptr, {0, 0}}; } + /// Indicate that a value's bytes are known, and track their producers. + static ByteDefinition vector(ByteVector &Ref, ByteLayout Layout) { + return {VECTOR, &Ref, Layout}; + } + /// Indicate that a value's bytes are opaque. + static ByteDefinition value(Value &V) { + return {VALUE, &V, getByteLayout(V.getType())}; + } + /// Indicate that the bytes come from a constant integer. + static ByteDefinition constInt(ConstantInt &Int) { + return {CONST_INT, &Int, getByteLayout(Int.getType())}; + } + /// Indicate that the bytes come from a constant vector of integers. 
+ static ByteDefinition constVec(Constant &Vec) { + assert(Vec.getType()->isVectorTy()); + return {CONST_VEC, &Vec, getByteLayout(Vec.getType())}; + } + + ByteVector &getVector() const { + assert(DefType == VECTOR); + return *static_cast(Ptr); + } + Value &getValue() const { + assert(DefType == VALUE); + return *static_cast(Ptr); + } + ConstantInt &getConstInt() const { + assert(DefType == CONST_INT); + return *static_cast(Ptr); + } + Constant &getConstVec() const { + assert(DefType == CONST_VEC); + return *static_cast(Ptr); + } + + bool isValid() const { return DefType != INVALID; } + + /// Return true iff the byte definition is valid. + operator bool() const { return isValid(); } + + /// Get the definition of the byte at the specified byte offset, where 0 is + /// the least significant byte. + Byte getByte(unsigned Idx) const { + switch (DefType) { + default: + llvm_unreachable("Invalid byte definition"); + case VECTOR: + return getVector()[Idx].getByte(); + case VALUE: + return Byte(getValue(), Idx); + case CONST_INT: + return Byte(getConstInt().getValue().extractBitsAsZExtValue( + Byte::BitWidth, Idx * Byte::BitWidth)); + case CONST_VEC: { + const auto &Vec = getConstVec(); + const ByteLayout Layout = getByteLayout(Vec.getType()); + const unsigned VecIdx = Idx / Layout.NumBytesPerElement; + const unsigned EltIdx = Idx % Layout.NumBytesPerElement; + + Constant *Elt = Vec.getAggregateElement(VecIdx); + if (const auto *Int = dyn_cast(Elt)) + return Byte(Int->getValue().extractBitsAsZExtValue( + Byte::BitWidth, EltIdx * Byte::BitWidth)); + + return Byte(*Elt, EltIdx); + } + } + } + + const ByteLayout &getLayout() const { return Layout; } + + void print(raw_ostream &ROS, bool NewLine = true) const { + switch (DefType) { + default: + ROS << "[INVALID]"; + break; + case VECTOR: { + ByteVector &BV = getVector(); + ROS << "{ "; + for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx) + ROS << ByteIdx << ": " << BV[ByteIdx].getByte() << "; "; + ROS << "}"; + break; + } + case VALUE: + ROS << "("; + getValue().printAsOperand(ROS); + ROS << ")[0:" << Layout.getNumBytes() << "]"; + break; + case CONST_INT: + ROS << getConstInt(); + break; + case CONST_VEC: + ROS << getConstVec(); + break; + } + + if (NewLine) + ROS << "\n"; + } + + LLVM_DUMP_METHOD void dump() const { print(errs(), true); } +}; + +inline raw_ostream &operator<<(raw_ostream &ROS, const ByteDefinition &Def) { + Def.print(ROS, false); + return ROS; +} + +/// Tries to update byte definitions using the provided instruction. +/// +/// In order to avoid eliminating values which are required for multiple packed +/// integers, the ByteExpander distinguishes two types of packed integer values: +/// - "Final" values, which are packed bytes which are either used by +/// instructions that cannot be classified as packed byte operations, or +/// values which are used by several other "final" values. +/// - "Intermediate" values, which are values whose sole raisons d'etre are to +/// produce bytes for a unique final value. +/// +/// Effectively, intermediate values may be eliminated or replaced freely, +/// whereas final values must remain present in the IR after the pass completes. +/// Accordingly, byte defniitions of final values are expanded only up to other +/// final value producers. +class ByteExpander final : public InstVisitor { + /// Resolution of values to their known definitions. + DenseMap Definitions; + /// Map to all (eventual) non-intermediate users of a value. 
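+  /// For example (illustrative), if %a is used only by an intermediate value
+  /// %b, and %b is used only by the final value %c, then the final users of
+  /// %a are {%c}.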
+ DenseMap> FinalUsers; + + void updateFinalUsers(Value *V); + bool checkIfIntermediate(Value *V, bool IsOperand); + +public: + // Visitation implementations return `true` iff a new byte definition was + // successfully constructed. + + ByteVector visitAdd(BinaryOperator &I); + ByteVector visitAnd(BinaryOperator &I); + ByteVector visitOr(BinaryOperator &I); + ByteVector visitXor(BinaryOperator &I); + ByteVector visitShl(BinaryOperator &I); + ByteVector visitLShr(BinaryOperator &I); + ByteVector visitTruncInst(TruncInst &I); + ByteVector visitZExtInst(ZExtInst &I); + ByteVector visitBitCastInst(BitCastInst &I); + ByteVector visitExtractElementInst(ExtractElementInst &I); + ByteVector visitInsertElementInst(InsertElementInst &I); + ByteVector visitShuffleVectorInst(ShuffleVectorInst &I); + // fallback for unhandled instructions + ByteVector visitInstruction(Instruction &I) { return {}; } + + /// Return the final values producing each byte of a value, if known, or + /// otherwise return a nullptr. + ByteVector *expandByteDefinition(Value *V); + + /// Decompose a value into its bytes. If \p ExpandDef is true, expand each + /// byte to the final values producing them if possible. The return value is + /// guaranteed to be valid so long as the value passed can be viewed as packed + /// bytes. + ByteDefinition getByteDefinition(Value *V, bool ExpandDef = true); + + /// Same as above, but only expand bytes to their final value producers if the + /// value \p V in question is an intermediate value. This is provided as a + /// convenience for instruction visitation, as definitions should only expand + /// until final value producers, even if the final value producers' bytes can + /// be expanded further. + ByteDefinition getByteDefinitionIfIntermediateOperand(Value *V); + + /// Get the set of all final values which use \p V. + const DenseSet &getFinalUsers(Value *V); + + /// Check if the provided value is known to be an intermediate value. + bool checkIfIntermediate(Value *V) { return checkIfIntermediate(V, false); } + + /// Iterate over all instructions in a function over several passes to + /// identify all final values and their byte definitions. + std::vector collectPIICandidates(Function &F); +}; + +ByteVector ByteExpander::visitAdd(BinaryOperator &I) { + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const ByteLayout &Layout = LhsDef.getLayout(); + const unsigned NumBytes = Layout.getNumBytes(); + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { + const Byte Lhs = LhsDef.getByte(ByteIdx); + const Byte Rhs = RhsDef.getByte(ByteIdx); + + const bool LhsIsZero = Lhs.isConstant() && Lhs.getConstant() == 0; + const bool RhsIsZero = Rhs.isConstant() && Rhs.getConstant() == 0; + if (LhsIsZero) + BV.emplace_back(Rhs, RhsIsZero ? 
ByteUse::AllOperands : 1); + else if (RhsIsZero) + BV.emplace_back(Lhs, 0); + else + return {}; + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitAnd(BinaryOperator &I) { + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const ByteLayout &Layout = LhsDef.getLayout(); + const unsigned NumBytes = Layout.getNumBytes(); + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { + const Byte Lhs = LhsDef.getByte(ByteIdx); + const Byte Rhs = RhsDef.getByte(ByteIdx); + + if (Lhs == Rhs) { + BV.emplace_back(Lhs, ByteUse::AllOperands); + continue; + } + + if (Lhs.isConstant()) { + if (Lhs.getConstant() == 0) { + BV.emplace_back(Byte::zeroes(), 0); + continue; + } + if (Lhs.getConstant() == Byte::AllOnes) { + BV.emplace_back(Rhs, 1); + continue; + } + } + if (Rhs.isConstant()) { + if (Rhs.getConstant() == 0) { + BV.emplace_back(Byte::zeroes(), 1); + continue; + } + if (Rhs.getConstant() == Byte::AllOnes) { + BV.emplace_back(Lhs, 0); + continue; + } + } + + if (Lhs == Rhs) { + BV.emplace_back(Lhs, ByteUse::AllOperands); + continue; + } + return {}; + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitOr(BinaryOperator &I) { + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const ByteLayout &Layout = LhsDef.getLayout(); + const unsigned NumBytes = Layout.getNumBytes(); + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { + const Byte Lhs = LhsDef.getByte(ByteIdx); + const Byte Rhs = RhsDef.getByte(ByteIdx); + + if (Lhs == Rhs) { + BV.emplace_back(Lhs, ByteUse::AllOperands); + continue; + } + + if (Lhs.isConstant()) { + if (Lhs.getConstant() == 0) { + BV.emplace_back(Rhs, 1); + continue; + } + if (Lhs.getConstant() == Byte::AllOnes) { + BV.emplace_back(Byte::ones(), 0); + continue; + } + } + + if (Rhs.isConstant()) { + if (Rhs.getConstant() == 0) { + BV.emplace_back(Lhs, 0); + continue; + } + if (Rhs.getConstant() == Byte::AllOnes) { + BV.emplace_back(Byte::ones(), 1); + continue; + } + } + + return {}; + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitXor(BinaryOperator &I) { + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const ByteLayout &Layout = LhsDef.getLayout(); + const unsigned NumBytes = Layout.getNumBytes(); + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { + const Byte Lhs = LhsDef.getByte(ByteIdx); + const Byte Rhs = RhsDef.getByte(ByteIdx); + if (Lhs == Rhs) + BV.emplace_back(Byte::zeroes(), ByteUse::AllOperands); + else if (Lhs.isConstant() && Lhs.getConstant() == 0) + BV.emplace_back(Rhs, 1); + else if (Rhs.isConstant() && Rhs.getConstant() == 0) + BV.emplace_back(Lhs, 0); + else + return {}; + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitShl(BinaryOperator &I) { + const ByteDefinition BaseDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if 
(!BaseDef) + return {}; + + const unsigned NumBytes = BaseDef.getLayout().getNumBytes(); + + const auto *Const = dyn_cast(I.getOperand(1)); + if (!Const) + return {}; + + if (isa(Const)) { + const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue(); + if (ShAmt % Byte::BitWidth != 0) + return {}; + const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytes); + + ByteVector BV; + BV.reserve(NumBytes); + BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0)); + for (unsigned ByteIdx = 0; ByteIdx + ByteShAmt < NumBytes; ++ByteIdx) + BV.emplace_back(BaseDef.getByte(ByteIdx), 0); + + assert(BV.size() == NumBytes); + return BV; + } + + assert(Const->getType()->isVectorTy()); + + ByteVector BV; + BV.reserve(NumBytes); + + const unsigned NumBytesPerElt = BaseDef.getLayout().NumBytesPerElement; + for (unsigned EltIdx = 0; EltIdx < BaseDef.getLayout().NumVecElements; + ++EltIdx) { + const auto *ConstInt = + dyn_cast(Const->getAggregateElement(EltIdx)); + if (!ConstInt) + return {}; + const unsigned ShAmt = ConstInt->getValue().getLimitedValue(); + if (ShAmt % Byte::BitWidth != 0) + return {}; + + const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytesPerElt); + BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0)); + for (unsigned ByteIdx = 0; ByteIdx + ByteShAmt < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(BaseDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 0); + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitLShr(BinaryOperator &I) { + const ByteDefinition BaseDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!BaseDef) + return {}; + + const unsigned NumBytes = BaseDef.getLayout().getNumBytes(); + + const auto *Const = dyn_cast(I.getOperand(1)); + if (!Const) + return {}; + + if (isa(Const)) { + const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue(); + if (ShAmt % Byte::BitWidth != 0) + return {}; + const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytes); + + ByteVector BV; + BV.reserve(NumBytes); + for (unsigned ByteIdx = ByteShAmt; ByteIdx < NumBytes; ++ByteIdx) + BV.emplace_back(BaseDef.getByte(ByteIdx), 0); + + BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0)); + + assert(BV.size() == NumBytes); + return BV; + } + + assert(Const->getType()->isVectorTy()); + + ByteVector BV; + BV.reserve(NumBytes); + + const unsigned NumBytesPerElt = BaseDef.getLayout().NumBytesPerElement; + + for (unsigned EltIdx = 0; EltIdx < BaseDef.getLayout().NumVecElements; + ++EltIdx) { + const auto *ConstInt = + dyn_cast(Const->getAggregateElement(EltIdx)); + if (!ConstInt) + return {}; + const unsigned ShAmt = ConstInt->getValue().getLimitedValue(); + if (ShAmt % Byte::BitWidth != 0) + return {}; + const unsigned ByteShAmt = std::min(ShAmt / Byte::BitWidth, NumBytesPerElt); + for (unsigned ByteIdx = ByteShAmt; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(BaseDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 0); + + BV.append(ByteShAmt, ByteUse(Byte::zeroes(), 0)); + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitTruncInst(TruncInst &I) { + const std::optional Layout = tryGetByteLayout(I.getType()); + if (!Layout) + return {}; + + const std::optional SrcLayout = + tryGetByteLayout(I.getOperand(0)->getType()); + if (!SrcLayout) + return {}; + + const ByteDefinition SrcDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!SrcDef) + return {}; + + ByteVector BV; + const unsigned NumBytes = Layout->getNumBytes(); + BV.reserve(NumBytes); + + const 
unsigned NumBytesPerElt = Layout->NumBytesPerElement; + const unsigned NumSrcBytesPerElt = SrcLayout->NumBytesPerElement; + for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx) + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(SrcDef.getByte(EltIdx * NumSrcBytesPerElt + ByteIdx), 0); + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitZExtInst(ZExtInst &I) { + const std::optional Layout = tryGetByteLayout(I.getType()); + if (!Layout) + return {}; + + const ByteDefinition SrcDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!SrcDef) + return {}; + + ByteVector BV; + const unsigned NumBytes = Layout->getNumBytes(); + BV.reserve(NumBytes); + + const unsigned NumSrcBytesPerElt = SrcDef.getLayout().NumBytesPerElement; + const unsigned NumZExtBytesPerElt = + Layout->NumBytesPerElement - NumSrcBytesPerElt; + + unsigned SrcIdx = 0; + for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx) { + for (unsigned ByteIdx = 0; ByteIdx < NumSrcBytesPerElt; ++SrcIdx, ++ByteIdx) + BV.emplace_back(SrcDef.getByte(SrcIdx), 0); + + BV.append(NumZExtBytesPerElt, ByteUse(Byte::zeroes(), 0)); + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitBitCastInst(BitCastInst &I) { + const std::optional Layout = tryGetByteLayout(I.getType()); + if (!Layout) + return {}; + + const ByteDefinition SrcDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!SrcDef) + return {}; + + const unsigned NumBytes = Layout->getNumBytes(); + ByteVector BV; + BV.reserve(NumBytes); + for (unsigned Idx = 0; Idx < NumBytes; ++Idx) + BV.emplace_back(SrcDef.getByte(Idx), 0); + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitExtractElementInst(ExtractElementInst &I) { + const ByteDefinition VecDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!VecDef) + return {}; + + const auto *VecIdx = dyn_cast(I.getOperand(1)); + if (!VecIdx) + return {}; + + const unsigned NumBytes = VecDef.getLayout().NumBytesPerElement; + const unsigned ByteOffset = VecIdx->getLimitedValue() * NumBytes; + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) + BV.emplace_back(VecDef.getByte(ByteIdx + ByteOffset), 0); + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitInsertElementInst(InsertElementInst &I) { + const ByteDefinition VecDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!VecDef) + return {}; + + const ByteDefinition EltDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!EltDef) + return {}; + + const auto *VecIdx = dyn_cast(I.getOperand(2)); + if (!VecIdx) + return {}; + + const unsigned NumBytes = VecDef.getLayout().getNumBytes(); + const unsigned NumBytesPerElt = VecDef.getLayout().NumBytesPerElement; + + ByteVector BV; + BV.reserve(NumBytes); + for (unsigned EltIdx = 0; EltIdx < VecDef.getLayout().NumVecElements; + ++EltIdx) { + if (EltIdx == VecIdx->getLimitedValue()) { + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(EltDef.getByte(ByteIdx), 0); + } else { + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(VecDef.getByte(EltIdx * NumBytesPerElt + ByteIdx), 1); + } + } + + assert(BV.size() == NumBytes); + return BV; +} + +ByteVector ByteExpander::visitShuffleVectorInst(ShuffleVectorInst &I) { + const std::optional Layout = 
tryGetByteLayout(I.getType()); + if (!Layout) + return {}; + + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + const ByteDefinition RhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(1)); + if (!LhsDef || !RhsDef) + return {}; + + const int LhsSize = LhsDef.getLayout().NumVecElements; + const unsigned NumBytes = Layout->getNumBytes(); + const unsigned NumBytesPerElt = Layout->NumBytesPerElement; + + ByteVector BV; + BV.reserve(NumBytes); + + for (unsigned EltIdx = 0; EltIdx < Layout->NumVecElements; ++EltIdx) { + const int Idx = I.getMaskValue(EltIdx); + if (Idx < 0) { + auto *Poison = PoisonValue::get(I.getType()->getElementType()); + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(Byte(*Poison, 0), ByteIdx); + } else if (Idx < LhsSize) { + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back( + LhsDef.getByte(static_cast(Idx) * NumBytesPerElt + + ByteIdx), + 0); + } else { + for (unsigned ByteIdx = 0; ByteIdx < NumBytesPerElt; ++ByteIdx) + BV.emplace_back(RhsDef.getByte(static_cast(Idx - LhsSize) * + NumBytesPerElt + + ByteIdx), + 1); + } + } + + assert(BV.size() == NumBytes); + return BV; +} + +void ByteExpander::updateFinalUsers(Value *V) { + assert(!isa(V)); + + // FIXME: Old users are copied because iterator is potentially invalidated by + // intermediacy checks. + DenseSet OldFinalUsers = getFinalUsers(V); + + DenseSet NewFinalUsers; + for (Value *User : OldFinalUsers) { + if (!Definitions.contains(User) || !checkIfIntermediate(User)) { + NewFinalUsers.insert(User); + continue; + } + const DenseSet &NestedUses = getFinalUsers(User); + NewFinalUsers.insert_range(NestedUses); + } + + FinalUsers[V] = std::move(NewFinalUsers); +} + +const DenseSet &ByteExpander::getFinalUsers(Value *V) { + assert(!isa(V)); + + auto It = FinalUsers.find(V); + + if (It != FinalUsers.end()) + return It->getSecond(); + + DenseSet &Uses = FinalUsers[V]; + for (Use &U : V->uses()) + Uses.insert(U.getUser()); + + return Uses; +} + +ByteVector *ByteExpander::expandByteDefinition(Value *V) { + auto It = Definitions.find(V); + if (It == Definitions.end()) + return nullptr; + return &It->getSecond(); +} + +ByteDefinition ByteExpander::getByteDefinition(Value *V, bool ExpandDef) { + const std::optional Layout = tryGetByteLayout(V->getType()); + if (!Layout) + return ByteDefinition::invalid(); + + if (auto *ConstInt = dyn_cast(V)) + return ByteDefinition::constInt(*ConstInt); + if (auto *Const = dyn_cast(V)) + if (Const->getType()->isVectorTy()) + return ByteDefinition::constVec(*Const); + + if (ExpandDef) + if (ByteVector *BV = expandByteDefinition(V)) + return ByteDefinition::vector(*BV, *Layout); + + return ByteDefinition::value(*V); +} + +ByteDefinition ByteExpander::getByteDefinitionIfIntermediateOperand(Value *V) { + return getByteDefinition(V, checkIfIntermediate(V, true)); +} + +bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) { + if (isa(V)) + return true; + + /// Short-circuit check. 
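+  /// An operand with exactly one use can only feed the instruction currently
+  /// being visited, so it is trivially intermediate.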
+ if (IsOperand && V->hasOneUse()) + return true; + + const DenseSet &FU = getFinalUsers(V); + if (FU.size() != 1) + return false; + + return Definitions.contains(*FU.begin()); +} + +std::vector ByteExpander::collectPIICandidates(Function &F) { + std::vector PackedIntInsts; + LLVM_DEBUG(dbgs() << "PICP: Entering function " << F.getName() << "\n"); + + unsigned NumIterations = 1; + for (;;) { + LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << "\n"); + bool Converged = true; + + std::vector CollectedInsts; + SetVector WorkList; + + for (BasicBlock *BB : ReversePostOrderTraversal(&F)) { + for (Instruction &I : *BB) { + ByteVector BV = visit(I); + if (BV.empty()) + continue; + + CollectedInsts.push_back(&I); + + ByteVector &Def = Definitions[&I]; + if (Def == BV) + continue; + + Converged = false; + Def = std::move(BV); + for (ByteUse &BU : Def) { + const Byte &B = BU.getByte(); + if (!B.isConstant() && !isa(B.getBase())) + WorkList.insert(B.getBase()); + } + + WorkList.insert(&I); + + LLVM_DEBUG({ + dbgs() << "PICP: Updating definition: "; + I.printAsOperand(dbgs()); + dbgs() << " = " << getByteDefinition(&I) << "\n"; + }); + } + } + + PackedIntInsts.swap(CollectedInsts); + + if (Converged) { + LLVM_DEBUG(dbgs() << "PICP: Reached fixpoint\n"); + break; + } + if (NumIterations == MaxCollectionIterations) { + LLVM_DEBUG(dbgs() << "PICP: Reached maximum iteration limit\n"); + break; + } + + // Update final uses of values before their operands. + for (auto RI = WorkList.rbegin(); RI != WorkList.rend(); ++RI) + updateFinalUsers(*RI); + ++NumIterations; + } + + LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << "\n"); + return PackedIntInsts; +} + +/// Return the value of all bits in a range, or std::nullopt if the bits vary. +static std::optional checkAllBits(const SmallBitVector &Mask, unsigned Lo, + unsigned NumBits) { + bool Bit = Mask[Lo]; + for (unsigned Idx = 1; Idx < NumBits; ++Idx) + if (Mask[Lo + Idx] != Bit) + return std::nullopt; + return Bit; +} + +/// Structure for tracking the set of bytes of a final value which are produced +/// by a given byte pack. +struct PartialBytePack { + + /// The value which produces the subset of bytes of a final value. + /// The byte pack is invalid if this pointer is null. + Value *BytePack; + /// A mask which identifies which bytes of a final value are provided by the + /// given byte pack. If a mask bit is not set, then the corresponding byte of + /// the byte pack must be zero. + SmallBitVector SetBytes; + + PartialBytePack(Value *BytePack, SmallBitVector SetBytes) + : BytePack(BytePack), SetBytes(SetBytes) {} + PartialBytePack(Value *BytePack, unsigned NumBytes) + : BytePack(BytePack), SetBytes(NumBytes) {} + + static PartialBytePack invalid() { return {nullptr, {}}; } + + bool isValid() const { return BytePack != nullptr; } +}; + +/// Construct an integer whose bytes are set depending on the value of the +/// corresponding \p Mask bit. A bit of \p Mask corresponds to an entire byte of +/// the resulting APInt. +static APInt createMaskConstant(unsigned BitWidth, const SmallBitVector &Mask) { + APInt BitMaskInt(BitWidth, 0); + for (unsigned ByteIdx : Mask.set_bits()) { + const unsigned BitIdx = ByteIdx * Byte::BitWidth; + if (BitIdx >= BitWidth) + break; + BitMaskInt.setBits(BitIdx, BitIdx + Byte::BitWidth); + } + return BitMaskInt; +} + +/// Construct a mask whose bits correspond to vector elements identified by the +/// \p ByteMask, or an empty vector if the \p Bytemask does not identify whole +/// vector elements. 
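+/// For example (illustrative), with NumBytesPerElement == 4, a ByteMask of
+/// size 8 with only its low four bits set maps to the element mask {1, 0},
+/// while a mask covering only part of an element yields an empty result.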
+static SmallBitVector getVectorElementMask(SmallBitVector &ByteMask, + unsigned NumBytesPerElement) { + if (ByteMask.size() % NumBytesPerElement != 0) + return {}; + const unsigned NumElts = ByteMask.size() / NumBytesPerElement; + + SmallBitVector EltMask; + EltMask.reserve(NumElts); + for (unsigned EltIdx = 0; EltIdx < NumElts; ++EltIdx) { + const std::optional Bits = + checkAllBits(ByteMask, EltIdx * NumBytesPerElement, NumBytesPerElement); + if (!Bits) + return {}; + EltMask.push_back(*Bits); + } + + assert(EltMask.size() == NumElts); + return EltMask; +} + +/// A key for the CastCache of the BytePackFolder. +struct CastEntry { + /// The value being casted. + Value *Base; + /// The type being casted into. + Type *CastTy; + /// The opcode of the cast instruction. + Instruction::CastOps OpCode; + + struct MapInfo { + static CastEntry getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey()}; + } + static CastEntry getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey()}; + } + static unsigned getHashValue(const CastEntry &E) { + return hash_combine( + DenseMapInfo::getHashValue(E.Base), + DenseMapInfo::getHashValue(E.CastTy), + DenseMapInfo::getHashValue(E.OpCode)); + } + static bool isEqual(const CastEntry &Lhs, const CastEntry &Rhs) { + return Lhs.Base == Rhs.Base && Lhs.CastTy == Rhs.CastTy && + Lhs.OpCode == Rhs.OpCode; + } + }; +}; + +/// The class responsible for taking coalesced bytes and folding them together +/// to produce the desired final value. +/// +/// When coalesced bytes are pushed, they are promoted to the target type, and +/// shifted to align the bytes to their corresponding offsets in the target +/// value. +class BytePackFolder { + /// The target final value to produce. + Instruction *TargetInst; + /// The layout of the target value. + ByteLayout Layout; + /// The collection of intermediate partial byte packs generated while folding + /// coalesced bytes. + std::vector WorkList; + /// The list of non-cast instructions generated while folding coalesced bytes. + SmallVector Insts; + /// A dedicated partial byte pack for collecting vector-aligned coalesced + /// bytes, if the target value is a vector type. + PartialBytePack VectorAlignedPack; + /// A cache holding all value casts needed, to avoid generating duplicate + /// casts. + MapVector> + CastCache; + + /// Create or reuse a cast of a given value. + Value *pushCast(Instruction::CastOps OpCode, Value *V, Type *DstTy) { + if (V->getType() == DstTy) + return V; + + CastEntry E{V, DstTy, OpCode}; + auto *It = CastCache.find(E); + if (It != CastCache.end()) + return It->second; + + auto *CI = CastInst::Create(OpCode, V, DstTy, V->getName() + ".cast"); + CastCache[E] = CI; + + LLVM_DEBUG({ + dbgs() << "PICP ["; + TargetInst->printAsOperand(dbgs()); + dbgs() << "]: Queuing cast " << *CI << "\n"; + }); + return CI; + } + + Instruction *pushInst(Instruction *I) { + // Cast instructions should be handled with pushCast. + assert(!isa(I)); + Insts.push_back(I); + + LLVM_DEBUG({ + dbgs() << "PICP ["; + TargetInst->printAsOperand(dbgs()); + dbgs() << "]: Queuing inst " << *I << "\n"; + }); + return I; + } + + /// Common functionality for promoting coalesced bytes to a vector. 
+ bool pushToVectorImpl(Value *V, SmallBitVector &ByteMask, unsigned NumSrcElts, + int ShrEltOffset, const Twine &Name) { + auto *TargetVecTy = cast(TargetInst->getType()); + auto *I32Ty = IntegerType::getInt32Ty(V->getContext()); + + // Try to push bytes to the vector-aligned builder. + SmallBitVector VecMask = + getVectorElementMask(ByteMask, Layout.NumBytesPerElement); + if (!VecMask.empty()) { + if (!VectorAlignedPack.isValid()) + VectorAlignedPack = PartialBytePack( + ConstantVector::getNullValue(TargetVecTy), Layout.getNumBytes()); + + if (NumSrcElts == 1) { + // Insert a single element + assert(ShrEltOffset <= 0); + VectorAlignedPack.BytePack = pushInst(InsertElementInst::Create( + VectorAlignedPack.BytePack, V, + ConstantInt::get(I32Ty, -ShrEltOffset), Name + ".insert")); + VectorAlignedPack.SetBytes |= ByteMask; + return true; + } + + assert(isa(V->getType())); + + if (NumSrcElts != Layout.NumVecElements) { + // We need to construct a vector of the same size as the vector-aligned + // byte pack before shuffling it in. + SmallVector ExtractMask; + ExtractMask.reserve(Layout.NumVecElements); + for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) { + if (VecMask.test(EltIdx)) { + const int SrcIdx = static_cast(EltIdx) + ShrEltOffset; + assert(SrcIdx >= 0); + ExtractMask.push_back(SrcIdx); + } else + ExtractMask.push_back(PoisonMaskElem); + } + assert(ExtractMask.size() == Layout.NumVecElements); + + V = pushInst(new ShuffleVectorInst(V, ExtractMask, Name + ".extract")); + // We have accounted for the shift already, so no need to account for it + // when shuffling into the vector-aligned byte pack. + ShrEltOffset = 0; + } + + assert(V->getType() == TargetVecTy); + + if (VecMask.all()) { + VectorAlignedPack.BytePack = V; + VectorAlignedPack.SetBytes.set(); + return true; + } + + SmallVector ShuffleMask; + ShuffleMask.reserve(Layout.NumVecElements); + for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) { + if (VecMask.test(EltIdx)) { + const int SrcIdx = static_cast(EltIdx) + ShrEltOffset; + assert(SrcIdx >= 0); + ShuffleMask.push_back(SrcIdx); + } else + ShuffleMask.push_back(EltIdx + Layout.NumVecElements); + } + assert(ShuffleMask.size() == Layout.NumVecElements); + + // We can shuffle directly into the vector-aligned byte pack. + VectorAlignedPack.BytePack = pushInst(new ShuffleVectorInst( + V, VectorAlignedPack.BytePack, ShuffleMask, Name + ".shuffle")); + VectorAlignedPack.SetBytes |= ByteMask; + return true; + } + + // Otherwise, just extract and mask the relevant elements, and append to the + // worklist. 
+ + if (NumSrcElts == 1) { + assert(ShrEltOffset <= 0); + V = pushInst(InsertElementInst::Create( + ConstantVector::getNullValue(TargetVecTy), V, + ConstantInt::get(I32Ty, -ShrEltOffset), Name + ".insert")); + } else if (NumSrcElts != Layout.NumVecElements) { + SmallVector ShuffleMask; + ShuffleMask.reserve(Layout.NumVecElements); + ShuffleMask.append(std::max(0, -ShrEltOffset), Layout.NumVecElements); + for (unsigned SrcIdx = std::max(0, ShrEltOffset); + SrcIdx < Layout.NumVecElements; ++SrcIdx) + ShuffleMask.push_back(SrcIdx); + ShuffleMask.append(std::max(0, ShrEltOffset), Layout.NumVecElements); + assert(ShuffleMask.size() == Layout.NumVecElements); + + V = pushInst( + new ShuffleVectorInst(V, ConstantVector::getNullValue(V->getType()), + ShuffleMask, Name + ".shuffle")); + } + + assert(V->getType() == TargetVecTy); + + const unsigned TargetBitWidth = Layout.getNumBytes() * Byte::BitWidth; + const unsigned TargetEltBitWidth = + Layout.NumBytesPerElement * Byte::BitWidth; + Type *TargetEltTy = TargetVecTy->getElementType(); + + APInt MaskBits = createMaskConstant(TargetBitWidth, ByteMask); + SmallVector EltwiseMask; + EltwiseMask.reserve(Layout.NumVecElements); + for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) + EltwiseMask.push_back(ConstantInt::get( + TargetEltTy, + MaskBits.extractBits(TargetEltBitWidth, EltIdx * TargetEltBitWidth))); + + V = pushInst(BinaryOperator::CreateAnd(V, ConstantVector::get(EltwiseMask), + Name + ".mask")); + + WorkList.emplace_back(V, ByteMask); + return true; + } + + bool pushIntegerToInteger(CoalescedBytes CB) { + assert(isa(CB.Base->getType())); + auto *TargetIntTy = cast(TargetInst->getType()); + + const unsigned NumTargetBytes = Layout.getNumBytes(); + Value *V = CB.Base; + const unsigned NumSrcBytes = getByteLayout(V->getType()).getNumBytes(); + const StringRef &Name = V->getName(); + + // Transformation: shr -> trunc -> mask -> zext -> shl + if (const unsigned ShrAmt = CB.getShrBits()) + V = pushInst(BinaryOperator::CreateLShr( + V, ConstantInt::get(V->getType(), ShrAmt), Name + ".shift")); + + if (NumSrcBytes > NumTargetBytes) + V = pushCast(Instruction::Trunc, V, TargetIntTy); + + const unsigned ShlByteOffset = CB.getShlBytes(); + const unsigned NumBytesToCheck = std::min( + ShlByteOffset < NumTargetBytes ? NumTargetBytes - ShlByteOffset : 0, + CB.getShrBytes() < NumSrcBytes ? NumSrcBytes - CB.getShrBytes() : 0); + if (!checkAllBits(CB.Mask, ShlByteOffset, NumBytesToCheck)) { + SmallBitVector RelMask = CB.Mask; + RelMask >>= ShlByteOffset; + Constant *Mask = ConstantInt::get( + V->getType(), + createMaskConstant(V->getType()->getIntegerBitWidth(), RelMask)); + V = pushInst(BinaryOperator::CreateAnd(V, Mask, Name + ".mask")); + } + + if (NumSrcBytes < NumTargetBytes) + V = pushCast(Instruction::ZExt, V, TargetIntTy); + + if (const unsigned ShlAmt = CB.getShlBits()) + V = pushInst(BinaryOperator::CreateShl( + V, ConstantInt::get(V->getType(), ShlAmt), Name + ".shift")); + + WorkList.emplace_back(V, CB.Mask); + return true; + } + + bool pushIntegerToVector(CoalescedBytes CB) { + assert(isa(CB.Base->getType())); + auto *TargetVecTy = cast(TargetInst->getType()); + Type *TargetEltTy = TargetVecTy->getElementType(); + + Value *V = CB.Base; + const unsigned NumSrcBytes = + V->getType()->getIntegerBitWidth() / Byte::BitWidth; + const StringRef &Name = V->getName(); + + // Give up if bytes are obtained from a strange offset. 
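+    // For example (illustrative), an i32 source contributing to a <2 x i16>
+    // target at a 1-byte offset would straddle an element boundary, so it is
+    // rejected here.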
+ if (CB.SignedShrByteOffset % Layout.NumBytesPerElement != 0) + return {}; + const int ShrEltOffset = + CB.SignedShrByteOffset / static_cast(Layout.NumBytesPerElement); + + // Give up if the source integer does not decompose naturally into vector + // elements. + if (NumSrcBytes % Layout.NumBytesPerElement != 0) + return {}; + const unsigned NumSrcElts = NumSrcBytes / Layout.NumBytesPerElement; + + if (NumSrcElts > 1) { + auto *CastTy = FixedVectorType::get(TargetEltTy, NumSrcElts); + V = pushCast(Instruction::BitCast, V, CastTy); + } + + return pushToVectorImpl(V, CB.Mask, NumSrcElts, ShrEltOffset, Name); + } + + bool pushVectorToInteger(CoalescedBytes CB) { + assert(isa(CB.Base->getType())); + auto *TargetIntTy = cast(TargetInst->getType()); + + const unsigned NumTargetBytes = Layout.getNumBytes(); + Value *V = CB.Base; + const StringRef &Name = V->getName(); + ByteLayout VecLayout = getByteLayout(V->getType()); + + // For sub-element accesses, try to subdivide the vector into smaller + // elements. + if (VecLayout.NumBytesPerElement > NumTargetBytes) { + if (VecLayout.NumBytesPerElement % NumTargetBytes != 0) + return {}; + + const unsigned SplitFactor = + VecLayout.NumBytesPerElement / NumTargetBytes; + auto *NewTy = FixedVectorType::get(TargetIntTy, VecLayout.NumVecElements * + SplitFactor); + V = pushCast(Instruction::BitCast, V, NewTy); + VecLayout = getByteLayout(V->getType()); + } + + // Give up if bytes are obtained from a strange offset. + if (CB.SignedShrByteOffset % VecLayout.NumBytesPerElement != 0) + return {}; + + int ShrEltOffset = + CB.SignedShrByteOffset / static_cast(VecLayout.NumBytesPerElement); + + // Give up if the target integer does not decompose naturally into vector + // elements. + if (NumTargetBytes % VecLayout.NumBytesPerElement != 0) + return {}; + const unsigned NumTargetElts = + NumTargetBytes / VecLayout.NumBytesPerElement; + + auto *I32Ty = IntegerType::getInt32Ty(V->getContext()); + + // Coarsely isolate elements of interest, and use a bitmask to clean up the + // rest. 
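+    // For example (illustrative), with an i32 target and a <4 x i16> source,
+    // the two relevant i16 elements are shuffled out and bitcast to i32, and
+    // a bitmask is only emitted afterwards if the wanted bytes do not cover
+    // whole source elements.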
+ const bool NeedsBitMask = [&] { + if (NumTargetElts == 1) { + // Extract the unique relevant element + const int ExtractIdx = ShrEltOffset; + assert(ExtractIdx >= 0); + V = pushInst(ExtractElementInst::Create( + V, ConstantInt::get(I32Ty, ExtractIdx), Name + ".extract")); + ShrEltOffset = 0; + return !CB.Mask.all(); + } + + if (NumTargetElts != VecLayout.NumVecElements) { + bool IsVectorAligned = true; + + // Extract all relevant elements into a shufflevector + SmallVector ShuffleMask; + ShuffleMask.reserve(NumTargetElts); + + for (unsigned EltIdx = 0; EltIdx < NumTargetElts; ++EltIdx) { + const std::optional EltMask = + checkAllBits(CB.Mask, EltIdx * VecLayout.NumBytesPerElement, + VecLayout.NumBytesPerElement); + + IsVectorAligned &= EltMask.has_value(); + if (!EltMask || *EltMask) { + const int ExtractIdx = static_cast(EltIdx) + ShrEltOffset; + assert(ExtractIdx >= 0); + ShuffleMask.push_back(ExtractIdx); + } else { + ShuffleMask.push_back(VecLayout.NumVecElements); + } + } + + V = pushInst( + new ShuffleVectorInst(V, ConstantVector::getNullValue(V->getType()), + ShuffleMask, Name + ".shuffle")); + V = pushCast(Instruction::BitCast, V, TargetIntTy); + + ShrEltOffset = 0; + return !IsVectorAligned; + } + + V = pushCast(Instruction::BitCast, V, TargetIntTy); + return !CB.Mask.all(); + }(); + + assert(V->getType() == TargetIntTy); + + const int ShrBitOffset = ShrEltOffset * + static_cast(VecLayout.NumBytesPerElement) * + static_cast(Byte::BitWidth); + if (ShrBitOffset > 0) + V = pushInst(BinaryOperator::CreateLShr( + V, ConstantInt::get(V->getType(), ShrBitOffset), Name + ".shift")); + else if (ShrBitOffset < 0) + V = pushInst(BinaryOperator::CreateShl( + V, ConstantInt::get(V->getType(), -ShrBitOffset), Name + ".shift")); + + if (NeedsBitMask) { + // Mask out unwanted bytes. + Constant *Mask = ConstantInt::get( + TargetIntTy, createMaskConstant(TargetIntTy->getBitWidth(), CB.Mask)); + V = pushInst(BinaryOperator::CreateAnd(V, Mask, Name + ".mask")); + } + + WorkList.emplace_back(V, CB.Mask); + return true; + } + + bool pushVectorToVector(CoalescedBytes CB) { + assert(isa(CB.Base->getType())); + auto *TargetVecTy = cast(TargetInst->getType()); + Type *TargetEltTy = TargetVecTy->getElementType(); + + const ByteLayout SrcLayout = getByteLayout(CB.Base->getType()); + Value *V = CB.Base; + const StringRef &Name = V->getName(); + + // Give up if the source vector cannot be converted to match the elements of + // the target vector. + if (SrcLayout.getNumBytes() % Layout.NumBytesPerElement != 0) + return {}; + const unsigned NumSrcElts = + SrcLayout.getNumBytes() / Layout.NumBytesPerElement; + + // Give up if the shift amount is not aligned to the target vector. 
+    if (CB.SignedShrByteOffset % Layout.NumBytesPerElement != 0)
+      return {};
+
+    const int ShrEltOffset =
+        CB.SignedShrByteOffset / static_cast<int>(Layout.NumBytesPerElement);
+
+    Type *SrcTy;
+    if (NumSrcElts > 1)
+      SrcTy = FixedVectorType::get(TargetEltTy, NumSrcElts);
+    else
+      SrcTy = TargetEltTy;
+
+    V = pushCast(Instruction::BitCast, V, SrcTy);
+
+    return pushToVectorImpl(V, CB.Mask, NumSrcElts, ShrEltOffset, Name);
+  }
+
+  PartialBytePack mergeIntegerPacks(PartialBytePack &Lhs,
+                                    PartialBytePack &Rhs) {
+    assert(isa<IntegerType>(Lhs.BytePack->getType()) &&
+           isa<IntegerType>(Rhs.BytePack->getType()));
+    Value *Merge = pushInst(BinaryOperator::CreateDisjointOr(
+        Lhs.BytePack, Rhs.BytePack, TargetInst->getName() + ".merge", nullptr));
+    return {Merge, Lhs.SetBytes | Rhs.SetBytes};
+  }
+
+  PartialBytePack mergeVectorPacks(PartialBytePack &Lhs, PartialBytePack &Rhs) {
+    assert(isa<FixedVectorType>(Lhs.BytePack->getType()) &&
+           isa<FixedVectorType>(Rhs.BytePack->getType()));
+    SmallVector<int> ShuffleMask;
+    ShuffleMask.reserve(Layout.NumVecElements);
+    for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx) {
+      const unsigned Lo = EltIdx * Layout.NumBytesPerElement;
+      const std::optional<bool> LhsBits =
+          checkAllBits(Lhs.SetBytes, Lo, Layout.NumBytesPerElement);
+      if (!LhsBits) {
+        const std::optional<bool> RhsBits =
+            checkAllBits(Rhs.SetBytes, Lo, Layout.NumBytesPerElement);
+        if (!RhsBits) {
+          ShuffleMask.clear();
+          break;
+        }
+        ShuffleMask.push_back(*RhsBits ? EltIdx + Layout.NumVecElements
+                                       : EltIdx);
+        continue;
+      }
+
+      ShuffleMask.push_back(*LhsBits ? EltIdx : EltIdx + Layout.NumVecElements);
+    }
+
+    const std::string Name = (TargetInst->getName() + ".merge").str();
+    Value *Merge;
+    if (ShuffleMask.empty())
+      Merge = pushInst(BinaryOperator::CreateDisjointOr(
+          Lhs.BytePack, Rhs.BytePack, Name, nullptr));
+    else
+      Merge = pushInst(
+          new ShuffleVectorInst(Lhs.BytePack, Rhs.BytePack, ShuffleMask, Name));
+
+    return {Merge, Lhs.SetBytes | Rhs.SetBytes};
+  }
+
+public:
+  BytePackFolder(Instruction *TargetV)
+      : TargetInst(TargetV), Layout(getByteLayout(TargetV->getType())),
+        VectorAlignedPack(PartialBytePack::invalid()) {}
+
+  ~BytePackFolder() {
+    // If instructions are not committed, they need to be cleaned up.
+    for (auto &[_, I] : CastCache) {
+      LLVM_DEBUG({
+        dbgs() << "PICP [";
+        TargetInst->printAsOperand(dbgs());
+        dbgs() << "]: Dequeuing cast " << *I << "\n";
+      });
+      I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+      I->deleteValue();
+    }
+
+    while (!Insts.empty()) {
+      LLVM_DEBUG({
+        dbgs() << "PICP [";
+        TargetInst->printAsOperand(dbgs());
+        dbgs() << "]: Dequeuing inst " << *Insts.back() << "\n";
+      });
+      Insts.back()->deleteValue();
+      Insts.pop_back();
+    }
+  }
+
+  /// Try to generate instructions for coalescing the given bytes and aligning
+  /// them to the target value. Returns true iff this is successful.
+  bool pushCoalescedBytes(CoalescedBytes CB) {
+    if (isa<Constant>(CB.Base) && CB.SignedShrByteOffset == 0) {
+      WorkList.emplace_back(CB.Base, CB.Mask);
+      return true;
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "PICP [";
+      TargetInst->printAsOperand(dbgs());
+      dbgs() << "]: Preparing bytes " << CB << "\n";
+    });
+    if (isa<FixedVectorType>(TargetInst->getType())) {
+      if (isa<FixedVectorType>(CB.Base->getType()))
+        return pushVectorToVector(CB);
+
+      return pushIntegerToVector(CB);
+    }
+
+    if (isa<FixedVectorType>(CB.Base->getType()))
+      return pushVectorToInteger(CB);
+
+    return pushIntegerToInteger(CB);
+  }
+
+  /// After coalescing all byte packs individually, this folds the coalesced
+  /// byte packs together to (re)produce the final value and return it.
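+  ///
+  /// Illustrative sketch only (assumed value names): two partial i32 packs
+  /// covering bytes {0,1} and {2,3} respectively are merged with a single
+  /// disjoint or,
+  ///   %x.merge = or disjoint i32 %x.lo, %x.hi
+  /// whereas vector packs whose set bytes land on distinct whole elements are
+  /// merged with one shufflevector instead.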
+  Value *foldBytePacks(IRBuilder<> &IRB) {
+    Type *TargetTy = TargetInst->getType();
+
+    if (VectorAlignedPack.isValid()) {
+      WorkList.push_back(VectorAlignedPack);
+      VectorAlignedPack = PartialBytePack::invalid();
+    }
+
+    while (WorkList.size() > 1) {
+      std::vector<PartialBytePack> NewWorkList;
+      NewWorkList.reserve((WorkList.size() + 1) / 2);
+
+      for (unsigned Item = 0; Item + 1 < WorkList.size(); Item += 2) {
+        PartialBytePack &Lhs = WorkList[Item];
+        PartialBytePack &Rhs = WorkList[Item + 1];
+        NewWorkList.push_back(isa<FixedVectorType>(TargetTy)
+                                  ? mergeVectorPacks(Lhs, Rhs)
+                                  : mergeIntegerPacks(Lhs, Rhs));
+      }
+      if (WorkList.size() % 2 == 1)
+        NewWorkList.push_back(WorkList.back());
+
+      WorkList.swap(NewWorkList);
+    }
+
+    IRB.SetInsertPoint(TargetInst);
+    for (Value *I : Insts)
+      IRB.Insert(I, I->getName());
+
+    Insts.clear();
+
+    for (auto &[E, I] : CastCache) {
+      if (auto *BaseI = dyn_cast<Instruction>(E.Base))
+        I->insertInto(BaseI->getParent(), *BaseI->getInsertionPointAfterDef());
+      else {
+        BasicBlock &BB = TargetInst->getFunction()->getEntryBlock();
+        I->insertInto(&BB, BB.getFirstInsertionPt());
+      }
+    }
+
+    CastCache.clear();
+
+    // Note: WorkList may be empty if the value is known to be zero.
+    return WorkList.empty() ? Constant::getNullValue(TargetTy)
+                            : WorkList.back().BytePack;
+  }
+};
+
+/// A final value (or an operand thereof, if the rewriter is not aggressive)
+/// queued up to be reconstructed.
+struct PackedIntInstruction {
+  /// The target value to reconstruct.
+  Instruction *TargetInst;
+  /// The chosen partitioning of its bytes.
+  SmallVector<CoalescedBytes> CBV;
+
+  PackedIntInstruction(Instruction &I, SmallVector<CoalescedBytes> &&CBV)
+      : TargetInst(&I), CBV(CBV) {}
+
+  /// Try to reconstruct a value given its coalesced byte partitioning,
+  /// returning the reconstructed value on success, or a nullptr on failure.
+  Value *rewrite(IRBuilder<> &IRB) const {
+    BytePackFolder BPF(TargetInst);
+    for (const CoalescedBytes &CB : CBV) {
+      if (!BPF.pushCoalescedBytes(CB)) {
+        LLVM_DEBUG(dbgs() << "PICP: Coalescing rejected!\n");
+        return nullptr;
+      }
+    }
+
+    return BPF.foldBytePacks(IRB);
+  }
+};
+
+/// Coalesce the bytes in a definition into a partition for rewriting.
+/// If the rewriter is non-aggressive, return nullopt if the rewriting is
+/// determined to be unnecessary.
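+///
+/// Illustrative sketch only (assumed value names): given
+///   %lo = and i32 %a, 65535
+///   %hi = shl i32 %b, 16
+///   %r  = or i32 %lo, %hi
+/// the byte definition of %r coalesces into two entries, bytes 0-1 sourced
+/// from %a unshifted and bytes 2-3 sourced from the low bytes of %b; a
+/// constant entry is appended only for bytes known to be nonzero constants.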
+static std::optional<SmallVector<CoalescedBytes>>
+getCoalescingOpportunity(Type *Ty, const ByteVector &BV) {
+  const ByteLayout Layout = getByteLayout(Ty);
+  assert(Layout.getNumBytes() == BV.size() &&
+         "Byte definition has unexpected width.");
+
+  SmallVector<CoalescedBytes> CBV;
+  SmallVector<int> CBVOperands;
+  const unsigned BitWidth = Layout.getNumBytes() * Byte::BitWidth;
+  APInt ConstBits(BitWidth, 0);
+  SmallBitVector ConstBytes(BV.size());
+
+  bool OperandsAlreadyCoalesced = true;
+  bool UsesSingleSource = true;
+  for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx) {
+    const ByteUse &BU = BV[ByteIdx];
+    const Byte &B = BU.getByte();
+    if (B.isConstant()) {
+      const unsigned Const = B.getConstant();
+      if (!Const)
+        continue;
+
+      ConstBits.insertBits(Const, ByteIdx * Byte::BitWidth, Byte::BitWidth);
+      ConstBytes.set(ByteIdx);
+    } else {
+      CoalescedBytes *CB = nullptr;
+      Value *Base = B.getBase();
+      const signed Offset =
+          static_cast<int>(B.getIndex()) - static_cast<int>(ByteIdx);
+      for (unsigned CBIdx = 0; CBIdx < CBV.size(); ++CBIdx) {
+        if (CBV[CBIdx].alignsWith(Base, Offset)) {
+          CB = &CBV[CBIdx];
+          int &OpIdx = CBVOperands[CBIdx];
+          if (OpIdx < 0)
+            OpIdx = BU.getOperandIndex();
+          else if (BU.getOperandIndex() >= 0 && OpIdx != BU.getOperandIndex()) {
+            LLVM_DEBUG(dbgs()
+                       << "PICP: Bytes " << *CB << " from operand " << OpIdx
+                       << " can be coalesced with byte " << B
+                       << " from operand " << BU.getOperandIndex() << "\n");
+            OperandsAlreadyCoalesced = false;
+          }
+        }
+      }
+
+      if (!CB) {
+        CB = &CBV.emplace_back(*Base, Offset, BV.size());
+        CBVOperands.push_back(BU.getOperandIndex());
+      }
+
+      UsesSingleSource &= CB->Base == CBV.front().Base;
+      CB->Mask.set(ByteIdx);
+    }
+  }
+
+  if (!AggressiveRewriting) {
+    if (OperandsAlreadyCoalesced && !CBV.empty()) {
+      // If packed bytes from the same source and offset are not split between
+      // operands, then this instruction does not need to be rewritten.
+      LLVM_DEBUG(dbgs() << "PICP: Operands are already coalesced.\n");
+      return std::nullopt;
+    }
+    if (UsesSingleSource && CBV.size() > 1) {
+      // If packed bytes come from the same source, but cannot be coalesced
+      // (e.g., bytes from one operand are shuffled), then rewriting this
+      // instruction may lead to strange IR.
+      LLVM_DEBUG(
+          dbgs()
+          << "PICP: Instruction rearranges bytes from a single source.\n");
+      return std::nullopt;
+    }
+  }
+
+  // The CBV will be used for rewriting; append the constant value that was
+  // also accumulated, if nonzero.
+  if (ConstBytes.any()) {
+    // Create initial constant as desired type.
+    if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+      SmallVector<Constant *> EltwiseMask;
+      const unsigned NumBitsPerElt = Layout.NumBytesPerElement * Byte::BitWidth;
+      EltwiseMask.reserve(Layout.NumVecElements);
+      for (unsigned EltIdx = 0; EltIdx < Layout.NumVecElements; ++EltIdx)
+        EltwiseMask.push_back(ConstantInt::get(
+            VecTy->getElementType(),
+            ConstBits.extractBits(NumBitsPerElt, EltIdx * NumBitsPerElt)));
+
+      CBV.emplace_back(*ConstantVector::get(EltwiseMask), 0, ConstBytes);
+    } else
+      CBV.emplace_back(*ConstantInt::get(Ty, ConstBits), 0, ConstBytes);
+  }
+
+  return CBV;
+}
+
+/// Queue into \p PIIV the set of final values (or operands thereof, if the
+/// rewriter is non-aggressive) which are deemed beneficial to rewrite.
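+///
+/// For instance (illustrative only, assumed value names), with non-aggressive
+/// rewriting a single-operand instruction such as
+///   %w = zext i16 %h to i32
+/// is never rewritten itself; the instruction defining %h is queued for
+/// coalescing instead.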
+static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
+                           Instruction &FinalInst, ByteExpander &BE) {
+  SmallVector<Instruction *> WorkList{&FinalInst};
+  SmallPtrSet<Instruction *, 8> Seen{&FinalInst};
+
+  do {
+    Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    const ByteVector *BV = BE.expandByteDefinition(I);
+    if (!BV)
+      // This instruction is beyond the analysis scope of PICP.
+      continue;
+
+    LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << "\n"
+                      << " byte pack: " << BE.getByteDefinition(I)
+                      << "\n");
+    auto CBV = [&]() -> std::optional<SmallVector<CoalescedBytes>> {
+      // Short-circuit check for casts.
+      if (!AggressiveRewriting && I->getNumOperands() == 1)
+        return std::nullopt;
+
+      return getCoalescingOpportunity(I->getType(), *BV);
+    }();
+
+    if (!CBV) {
+      // Narrow rewriting to the operands of this instruction instead.
+      for (Use &U : I->operands())
+        if (auto *Op = dyn_cast<Instruction>(U.get()))
+          if (Seen.insert(Op).second)
+            WorkList.push_back(Op);
+      continue;
+    }
+
+    PIIV.emplace_back(*I, std::move(*CBV));
+  } while (!WorkList.empty());
+}
+
+static bool runImpl(Function &F) {
+  ByteExpander BE;
+
+  std::vector<Instruction *> PIICandidates = BE.collectPIICandidates(F);
+  std::vector<PackedIntInstruction> PIIV;
+
+  for (Instruction *I : PIICandidates) {
+    if (!BE.checkIfIntermediate(I))
+      queueRewriting(PIIV, *I, BE);
+    else
+      LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << "\n"
+                        << " final user: "
+                        << **BE.getFinalUsers(I).begin() << "\n");
+  }
+
+  DenseMap<Instruction *, Value *> InstSubs;
+  IRBuilder<> IRB(F.getContext());
+  for (const PackedIntInstruction &PII : PIIV)
+    if (Value *V = PII.rewrite(IRB)) {
+      LLVM_DEBUG(dbgs() << "PICP rewrite successful for " << *PII.TargetInst
+                        << "\n");
+      InstSubs[PII.TargetInst] = V;
+    }
+
+  if (InstSubs.empty())
+    return false;
+
+  for (auto &[OldI, NewV] : InstSubs)
+    OldI->replaceAllUsesWith(NewV);
+
+  for (auto RIt = PIICandidates.rbegin(); RIt != PIICandidates.rend(); ++RIt) {
+    Instruction *I = *RIt;
+    if (I->getNumUses() == 0)
+      I->eraseFromParent();
+  }
+  return true;
+}
+
+class PackedIntegerCombineLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  PackedIntegerCombineLegacyPass() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override { return runImpl(F); }
+};
+char PackedIntegerCombineLegacyPass::ID = 0;
+
+} // namespace
+
+PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  if (!runImpl(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+INITIALIZE_PASS(PackedIntegerCombineLegacyPass, DEBUG_TYPE,
+                "Packed Integer Combine", false, false)
+
+FunctionPass *llvm::createPackedIntegerCombinePass() {
+  return new PackedIntegerCombineLegacyPass();
+}
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index c7e4a3e824700..2c2b8964af8d5 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -36,6 +36,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLowerAtomicLegacyPassPass(Registry);
   initializeMergeICmpsLegacyPassPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
+  initializePackedIntegerCombineLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
   initializeReassociateLegacyPassPass(Registry);
   initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index 93b5f155fc81e..2cd541363d44d 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++
b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -205,12 +205,12 @@ define i64 @load_3xi16_combine(ptr addrspace(1) %p) #0 { ; GCN-LABEL: load_3xi16_combine: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: global_load_ushort v3, v[0:1], off offset:4 +; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1 %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index af3241e95e91d..801712ba90988 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -541,6 +541,7 @@ ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis ; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer +; GCN-O1-OPTS-NEXT: Packed Integer Combine ; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches ; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators @@ -856,6 +857,7 @@ ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: GPU Load and Store Vectorizer +; GCN-O2-NEXT: Packed Integer Combine ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches ; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators @@ -1186,6 +1188,7 @@ ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: GPU Load and Store Vectorizer +; GCN-O3-NEXT: Packed Integer Combine ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower SwitchInst's to branches ; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 38e45042b5ee4..d13f5d569aae6 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1867,27 +1867,9 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v3, s3 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s3 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_readfirstlane_b32 s0, v1 -; NOSDWA-NEXT: v_readfirstlane_b32 s1, v0 -; NOSDWA-NEXT: s_lshr_b32 s3, s1, 24 -; NOSDWA-NEXT: s_lshr_b32 s5, s0, 24 -; NOSDWA-NEXT: s_and_b32 s2, s1, 0xffff -; NOSDWA-NEXT: s_bfe_u32 s1, s1, 0x80010 -; NOSDWA-NEXT: s_and_b32 s4, s0, 0xffff -; NOSDWA-NEXT: s_bfe_u32 s0, s0, 0x80010 -; NOSDWA-NEXT: s_lshl_b32 s3, s3, 8 -; NOSDWA-NEXT: s_lshl_b32 s5, s5, 8 -; NOSDWA-NEXT: s_or_b32 s1, s1, s3 -; NOSDWA-NEXT: s_or_b32 s0, s0, s5 -; NOSDWA-NEXT: s_lshl_b32 s1, s1, 16 -; NOSDWA-NEXT: s_lshl_b32 s0, s0, 16 -; NOSDWA-NEXT: s_or_b32 s1, s2, s1 -; NOSDWA-NEXT: s_or_b32 s0, s4, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v0, s1 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s0 ; NOSDWA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; NOSDWA-NEXT: s_endpgm ; @@ -1898,85 +1880,21 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; GFX89-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX89-NEXT: v_mov_b32_e32 v3, s3 ; GFX89-NEXT: v_mov_b32_e32 v2, s2 +; GFX89-NEXT: v_mov_b32_e32 v3, s3 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_readfirstlane_b32 s0, v1 -; GFX89-NEXT: v_readfirstlane_b32 s1, v0 -; GFX89-NEXT: s_lshr_b32 s3, s1, 24 -; GFX89-NEXT: s_lshr_b32 s5, s0, 24 -; GFX89-NEXT: s_and_b32 s2, s1, 0xffff -; GFX89-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX89-NEXT: s_and_b32 s4, s0, 0xffff -; GFX89-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX89-NEXT: s_lshl_b32 s3, s3, 8 -; GFX89-NEXT: s_lshl_b32 s5, s5, 8 -; GFX89-NEXT: s_or_b32 s1, s1, s3 -; GFX89-NEXT: s_or_b32 s0, s0, s5 -; GFX89-NEXT: s_lshl_b32 s1, s1, 16 -; GFX89-NEXT: s_lshl_b32 s0, s0, 16 -; GFX89-NEXT: s_or_b32 s1, s2, s1 -; GFX89-NEXT: s_or_b32 s0, s4, s0 -; GFX89-NEXT: v_mov_b32_e32 v0, s1 -; GFX89-NEXT: v_mov_b32_e32 v1, s0 ; GFX89-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX89-NEXT: s_endpgm ; -; GFX9-LABEL: pulled_out_test: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_lshr_b32 s5, s1, 24 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: s_and_b32 s4, s1, 0xffff -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX9-NEXT: s_and_b32 s6, s0, 0xffff -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_or_b32 s1, s1, s5 -; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s1, s4, s1 -; GFX9-NEXT: s_or_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: pulled_out_test: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_and_b32 s4, s0, 0xffff -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_and_b32 s6, s1, 0xffff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_or_b32 s0, s0, s5 -; GFX10-NEXT: s_or_b32 s1, s1, s7 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_or_b32 s1, s6, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] -; GFX10-NEXT: s_endpgm +; GFX9_10-LABEL: pulled_out_test: +; GFX9_10: ; %bb.0: ; %entry +; GFX9_10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9_10-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_10-NEXT: s_waitcnt lgkmcnt(0) +; GFX9_10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9_10-NEXT: s_waitcnt vmcnt(0) +; GFX9_10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9_10-NEXT: s_endpgm entry: %idxprom = ashr exact i64 15, 32 %arrayidx = getelementptr inbounds <8 x i8>, ptr addrspace(1) %sourceA, i64 %idxprom @@ -2297,5 +2215,4 @@ declare i32 
@llvm.amdgcn.workitem.id.x() attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} -; GFX9_10: {{.*}} ; SDWA: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 5aafb0f576fb4..fc990283b004c 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: KILL undef %125:sgpr_128 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %119:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: KILL undef %119:sgpr_128 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc @@ -44,37 +44,38 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) - ; CHECK-NEXT: KILL undef %74:sreg_64 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.78, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %68:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 + ; CHECK-NEXT: KILL undef %68:sreg_64 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
(s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %83:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL undef %89:sgpr_128 - ; CHECK-NEXT: KILL undef %118:sgpr_128 + ; CHECK-NEXT: KILL undef %83:sgpr_128 + ; CHECK-NEXT: KILL undef %112:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.84, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.90, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 + ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %163:sreg_32, 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %163:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.96, addrspace 4) ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef 
[[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %163:sreg_32, implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc @@ -87,20 +88,21 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %296:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %351:sgpr_128, undef %352:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %362:sgpr_128, 16, 0 :: 
(dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.104, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %346:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc @@ -114,19 +116,19 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant 
load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) + ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %378:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.131, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.142, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.147, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.159, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.137, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.154, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc @@ -138,50 +140,47 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN 
[[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.270, align 8, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET 
[[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.279, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4) - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 - ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub3 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM1]], 0, 0 :: (dereferenceable invariant load 
(s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) @@ -191,33 +190,32 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4) - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 - ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.287, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub2:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub3:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub3 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM2]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.253, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %467:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: KILL undef %467:sreg_64 + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX2_IMM2]].sub0_sub1_sub2, [[S_LOAD_DWORDX2_IMM2]].sub3 ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %470:sreg_64 - ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc - ; 
CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.262, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.296, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] - ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc @@ -226,20 +224,20 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = 
S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.314, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.320, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.326, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] - ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] + ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -267,9 +265,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec ; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec - ; CHECK-NEXT: 
[[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN9]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUBREV_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 51, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_14]], [[V_SUBREV_U32_e64_9]], implicit $exec ; CHECK-NEXT: [[V_SUBREV_U32_e64_11:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 52, [[BUFFER_LOAD_FORMAT_X_IDXEN10]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_16:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_15]], [[V_SUBREV_U32_e64_10]], implicit $exec @@ -351,13 +349,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %540:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %554:vgpr_32, undef %556:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index c554fdbf4c799..9aba7e030ec19 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -207,6 +207,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 62bb02d9b3c40..3ed2950fa9ce2 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -133,6 +133,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: 
Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 0da7a9f73bdce..bc04d1b35bb82 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -118,6 +118,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 38b7890682783..772b3cc4a5fb6 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -127,6 +127,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index 5aacd26def2be..84d62d56c3a9b 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -165,6 +165,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index f6a9406596803..6e7b973213494 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -167,6 +167,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index 48a9433d24999..10e3f7c6e1fb6 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -131,6 +131,7 @@ ; CHECK-O-NEXT: Running pass: SCCPPass ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: PackedIntegerCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis diff 
--git a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll new file mode 100644 index 0000000000000..e7b6a6bc66fa1 --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll @@ -0,0 +1,601 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @add.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @add.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[ADD]] +; +; AGGRESSIVE-LABEL: define i32 @add.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936 +; AGGRESSIVE-NEXT: [[ADD_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[ADD_MERGE]] +; + %a.mask = and i32 %a, u0xff00ff00 + %b.mask = and i32 %b, u0x00ff00ff + %add = add i32 %a.mask, %b.mask + ret i32 %add +} + +;; u0xff00ffff = -16711681 +;; u0x00ff00ff = 16711935 +;; Nothing happens in this case because of the overlapping bytes. +define i32 @add.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @add.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[ADD]] +; +; AGGRESSIVE-LABEL: define i32 @add.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681 +; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[ADD:%.*]] = add i32 [[A_MASK2]], [[B_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[ADD]] +; + %a.mask = and i32 %a, u0xff00ffff + %b.mask = and i32 %b, u0x00ff00ff + %add = add i32 %a.mask, %b.mask + ret i32 %add +} + +define i32 @and.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @and.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: ret i32 0 +; +; AGGRESSIVE-LABEL: define i32 @and.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: ret i32 0 +; + %a.mask = and i32 %a, u0xff00ff00 + %b.mask = and i32 %b, u0x00ff00ff + %and = and i32 %a.mask, %b.mask + ret i32 %and +} + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @and.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @and.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[AND:%.*]] = and i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[AND]] +; +; AGGRESSIVE-LABEL: define i32 @and.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681 +; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[AND:%.*]] = and i32 [[A_MASK2]], [[B_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[AND]] +; + %a.mask = and i32 %a, u0xff00ffff + %b.mask = and i32 %b, u0x00ff00ff + %and = and i32 %a.mask, %b.mask + ret i32 %and +} + +define i32 @and.2(i32 %x) { 
+; LAZY-LABEL: define i32 @and.2( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_0:%.*]] = and i32 [[X]], -16711681 +; LAZY-NEXT: [[X_1:%.*]] = and i32 [[X]], 16711935 +; LAZY-NEXT: [[AND:%.*]] = and i32 [[X_0]], [[X_1]] +; LAZY-NEXT: ret i32 [[AND]] +; +; AGGRESSIVE-LABEL: define i32 @and.2( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], 255 +; AGGRESSIVE-NEXT: ret i32 [[X_MASK]] +; + %x.0 = and i32 %x, u0xff00ffff + %x.1 = and i32 %x, u0x00ff00ff + %and = and i32 %x.0, %x.1 + ret i32 %and +} + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @or.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @or.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[OR:%.*]] = or i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[OR]] +; +; AGGRESSIVE-LABEL: define i32 @or.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936 +; AGGRESSIVE-NEXT: [[OR_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[OR_MERGE]] +; + %a.mask = and i32 %a, u0xff00ff00 + %b.mask = and i32 %b, u0x00ff00ff + %or = or i32 %a.mask, %b.mask + ret i32 %or +} + +;; u0xff00ffff = -16711681 +;; u0x00ff00ff = 16711935 +;; Nothing happens in this case because of the overlapping bytes. +define i32 @or.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @or.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[OR:%.*]] = or i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[OR]] +; +; AGGRESSIVE-LABEL: define i32 @or.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681 +; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[OR:%.*]] = or i32 [[A_MASK2]], [[B_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[OR]] +; + %a.mask = and i32 %a, u0xff00ffff + %b.mask = and i32 %b, u0x00ff00ff + %or = or i32 %a.mask, %b.mask + ret i32 %or +} + +define i32 @or.2(i32 %x) { +; LAZY-LABEL: define i32 @or.2( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: ret i32 [[X]] +; +; AGGRESSIVE-LABEL: define i32 @or.2( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: ret i32 [[X]] +; + %x.0 = and i32 %x, u0xff00ffff + %x.1 = and i32 %x, u0x00ff00ff + %or = or i32 %x.0, %x.1 + ret i32 %or +} + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @or.3(i32 %x) { +; LAZY-LABEL: define i32 @or.3( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_0:%.*]] = or i32 [[X]], 16711935 +; LAZY-NEXT: ret i32 [[X_0]] +; +; AGGRESSIVE-LABEL: define i32 @or.3( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -16711936 +; AGGRESSIVE-NEXT: [[X_0_MERGE:%.*]] = or disjoint i32 [[X_MASK]], 16711935 +; AGGRESSIVE-NEXT: ret i32 [[X_0_MERGE]] +; + %x.0 = or i32 %x, u0x00ff00ff + ret i32 %x.0 +} + +;; u0xff00ff00 = -16711936 +;; u0x00ff00ff = 16711935 +define i32 @xor.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @xor.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711936 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[XOR]] +; +; AGGRESSIVE-LABEL: define i32 
@xor.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[A_MASK4:%.*]] = and i32 [[A]], -16711936 +; AGGRESSIVE-NEXT: [[XOR_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[XOR_MERGE]] +; + %a.mask = and i32 %a, u0xff00ff00 + %b.mask = and i32 %b, u0x00ff00ff + %xor = xor i32 %a.mask, %b.mask + ret i32 %xor +} + +;; u0xff00ffff = -16711681 +;; u0x00ff00ff = 16711935 +;; Nothing happens in this case because of the overlapping bytes. +define i32 @xor.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @xor.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16711681 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], 16711935 +; LAZY-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK]], [[B_MASK]] +; LAZY-NEXT: ret i32 [[XOR]] +; +; AGGRESSIVE-LABEL: define i32 @xor.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i32 [[A]], -16711681 +; AGGRESSIVE-NEXT: [[B_MASK4:%.*]] = and i32 [[B]], 16711935 +; AGGRESSIVE-NEXT: [[XOR:%.*]] = xor i32 [[A_MASK2]], [[B_MASK4]] +; AGGRESSIVE-NEXT: ret i32 [[XOR]] +; + %a.mask = and i32 %a, u0xff00ffff + %b.mask = and i32 %b, u0x00ff00ff + %xor = xor i32 %a.mask, %b.mask + ret i32 %xor +} + +define i32 @xor.2(i32 %x) { +; LAZY-LABEL: define i32 @xor.2( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -256 +; LAZY-NEXT: ret i32 [[X_MASK]] +; +; AGGRESSIVE-LABEL: define i32 @xor.2( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -256 +; AGGRESSIVE-NEXT: ret i32 [[X_MASK]] +; + %x.0 = and i32 %x, u0xff00ffff + %x.1 = and i32 %x, u0x00ff00ff + %xor = xor i32 %x.0, %x.1 + ret i32 %xor +} + +define i32 @shl.0(i32 %base) { +; LAZY-LABEL: define i32 @shl.0( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 16777215 +; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 8 +; LAZY-NEXT: [[POST:%.*]] = and i32 [[SHL]], -256 +; LAZY-NEXT: ret i32 [[POST]] +; +; AGGRESSIVE-LABEL: define i32 @shl.0( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = shl i32 [[BASE]], 8 +; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]] +; + %pre = and i32 %base, u0x00ffffff + %shl = shl i32 %pre, 8 + %post = and i32 %shl, u0xffffff00 + ret i32 %post +} + +;; u0x0000ff00 = 65280 +define i32 @shl.1(i32 %base) { +; LAZY-LABEL: define i32 @shl.1( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], -16711936 +; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 8 +; LAZY-NEXT: ret i32 [[SHL]] +; +; AGGRESSIVE-LABEL: define i32 @shl.1( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_MASK:%.*]] = and i32 [[BASE]], 65280 +; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = shl i32 [[BASE_MASK]], 8 +; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]] +; + %pre = and i32 %base, u0xff00ff00 + %shl = shl i32 %pre, 8 + ret i32 %shl +} + +;; u0x0fffffff = 268435455 +;; Nothing happens because it is not byte-aligned. 
+define i32 @shl.2(i32 %base) { +; LAZY-LABEL: define i32 @shl.2( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 268435455 +; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 4 +; LAZY-NEXT: [[POST:%.*]] = and i32 [[SHL]], -16 +; LAZY-NEXT: ret i32 [[POST]] +; +; AGGRESSIVE-LABEL: define i32 @shl.2( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 268435455 +; AGGRESSIVE-NEXT: [[SHL:%.*]] = shl i32 [[PRE]], 4 +; AGGRESSIVE-NEXT: [[POST:%.*]] = and i32 [[SHL]], -16 +; AGGRESSIVE-NEXT: ret i32 [[POST]] +; + %pre = and i32 %base, u0x0fffffff + %shl = shl i32 %pre, 4 + %post = and i32 %shl, u0xfffffff0 + ret i32 %post +} + +define <8 x i8> @shl.vec(<2 x i32> %base) { +; LAZY-LABEL: define <8 x i8> @shl.vec( +; LAZY-SAME: <2 x i32> [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and <2 x i32> [[BASE]], +; LAZY-NEXT: [[SHL:%.*]] = shl <2 x i32> [[PRE]], +; LAZY-NEXT: [[POST:%.*]] = and <2 x i32> [[SHL]], splat (i32 -256) +; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[POST]] to <8 x i8> +; LAZY-NEXT: ret <8 x i8> [[CAST]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @shl.vec( +; AGGRESSIVE-SAME: <2 x i32> [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_CAST:%.*]] = bitcast <2 x i32> [[BASE]] to <8 x i8> +; AGGRESSIVE-NEXT: [[BASE_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[BASE_SHUFFLE2:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> [[BASE_SHUFFLE]], <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[BASE_SHUFFLE2]] +; + %pre = and <2 x i32> %base, + %shl = shl <2 x i32> %pre, + %post = and <2 x i32> %shl, splat(i32 u0xffffff00) + %cast = bitcast <2 x i32> %post to <8 x i8> + ret <8 x i8> %cast +} + +define i32 @lshr.0(i32 %base) { +; LAZY-LABEL: define i32 @lshr.0( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], -256 +; LAZY-NEXT: [[LSHR:%.*]] = lshr i32 [[PRE]], 8 +; LAZY-NEXT: [[POST:%.*]] = and i32 [[LSHR]], 16777215 +; LAZY-NEXT: ret i32 [[POST]] +; +; AGGRESSIVE-LABEL: define i32 @lshr.0( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = lshr i32 [[BASE]], 8 +; AGGRESSIVE-NEXT: ret i32 [[BASE_SHIFT]] +; + %pre = and i32 %base, u0xffffff00 + %lshr = lshr i32 %pre, 8 + %post = and i32 %lshr, u0x00ffffff + ret i32 %post +} + +;; u0x0000ff00 = 65280 +define i32 @lshr.1(i32 %base) { +; LAZY-LABEL: define i32 @lshr.1( +; LAZY-SAME: i32 [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and i32 [[BASE]], 16711935 +; LAZY-NEXT: [[LSHR:%.*]] = lshr i32 [[PRE]], 8 +; LAZY-NEXT: ret i32 [[LSHR]] +; +; AGGRESSIVE-LABEL: define i32 @lshr.1( +; AGGRESSIVE-SAME: i32 [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_SHIFT:%.*]] = lshr i32 [[BASE]], 8 +; AGGRESSIVE-NEXT: [[BASE_MASK:%.*]] = and i32 [[BASE_SHIFT]], 65280 +; AGGRESSIVE-NEXT: ret i32 [[BASE_MASK]] +; + %pre = and i32 %base, u0x00ff00ff + %lshr = lshr i32 %pre, 8 + ret i32 %lshr +} + +define <8 x i8> @lshr.vec(<2 x i32> %base) { +; LAZY-LABEL: define <8 x i8> @lshr.vec( +; LAZY-SAME: <2 x i32> [[BASE:%.*]]) { +; LAZY-NEXT: [[PRE:%.*]] = and <2 x i32> [[BASE]], +; LAZY-NEXT: [[LSHR:%.*]] = lshr <2 x i32> [[PRE]], +; LAZY-NEXT: [[POST:%.*]] = and <2 x i32> [[LSHR]], splat (i32 16777215) +; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[POST]] to <8 x i8> +; LAZY-NEXT: ret <8 x i8> [[CAST]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @lshr.vec( +; AGGRESSIVE-SAME: <2 x i32> [[BASE:%.*]]) { +; AGGRESSIVE-NEXT: [[BASE_CAST:%.*]] = bitcast <2 x i32> [[BASE]] to <8 x i8> +; 
AGGRESSIVE-NEXT: [[BASE_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[BASE_SHUFFLE2:%.*]] = shufflevector <8 x i8> [[BASE_CAST]], <8 x i8> [[BASE_SHUFFLE]], <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[BASE_SHUFFLE2]] +; + %pre = and <2 x i32> %base, + %lshr = lshr <2 x i32> %pre, + %post = and <2 x i32> %lshr, splat(i32 u0x00ffffff) + %cast = bitcast <2 x i32> %post to <8 x i8> + ret <8 x i8> %cast +} + +define i32 @trunc.0(i64 %src) { +; LAZY-LABEL: define i32 @trunc.0( +; LAZY-SAME: i64 [[SRC:%.*]]) { +; LAZY-NEXT: [[MASK:%.*]] = and i64 [[SRC]], 4294967295 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[MASK]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @trunc.0( +; AGGRESSIVE-SAME: i64 [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = trunc i64 [[SRC]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[SRC_CAST]] +; + %mask = and i64 %src, u0x00000000ffffffff + %trunc = trunc i64 %mask to i32 + ret i32 %trunc +} + +;; u0xff00ff00 = -16711936 +define i32 @trunc.1(i64 %src) { +; LAZY-LABEL: define i32 @trunc.1( +; LAZY-SAME: i64 [[SRC:%.*]]) { +; LAZY-NEXT: [[MASK:%.*]] = and i64 [[SRC]], -71777214294589696 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[MASK]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @trunc.1( +; AGGRESSIVE-SAME: i64 [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = trunc i64 [[SRC]] to i32 +; AGGRESSIVE-NEXT: [[SRC_MASK:%.*]] = and i32 [[SRC_CAST]], -16711936 +; AGGRESSIVE-NEXT: ret i32 [[SRC_MASK]] +; + %mask = and i64 %src, u0xff00ff00ff00ff00 + %trunc = trunc i64 %mask to i32 + ret i32 %trunc +} + +define <4 x i8> @trunc.vec(<2 x i32> %src) { +; LAZY-LABEL: define <4 x i8> @trunc.vec( +; LAZY-SAME: <2 x i32> [[SRC:%.*]]) { +; LAZY-NEXT: [[TRUNC:%.*]] = trunc <2 x i32> [[SRC]] to <2 x i16> +; LAZY-NEXT: [[CAST:%.*]] = bitcast <2 x i16> [[TRUNC]] to <4 x i8> +; LAZY-NEXT: ret <4 x i8> [[CAST]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @trunc.vec( +; AGGRESSIVE-SAME: <2 x i32> [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <2 x i32> [[SRC]] to <8 x i8> +; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = shufflevector <8 x i8> [[SRC_CAST]], <8 x i8> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[SRC_SHUFFLE:%.*]] = shufflevector <4 x i8> [[SRC_EXTRACT]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[SRC_EXTRACT2:%.*]] = shufflevector <8 x i8> [[SRC_CAST]], <8 x i8> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[SRC_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[SRC_EXTRACT2]], <4 x i8> [[SRC_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[SRC_SHUFFLE4]] +; + %trunc = trunc <2 x i32> %src to <2 x i16> + %cast = bitcast <2 x i16> %trunc to <4 x i8> + ret <4 x i8> %cast +} + +define i32 @bitcast.0(i32 %x) { +; LAZY-LABEL: define i32 @bitcast.0( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[CAST:%.*]] = bitcast i32 [[X]] to <4 x i8> +; LAZY-NEXT: [[BACK:%.*]] = bitcast <4 x i8> [[CAST]] to i32 +; LAZY-NEXT: ret i32 [[BACK]] +; +; AGGRESSIVE-LABEL: define i32 @bitcast.0( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: ret i32 [[X]] +; + %cast = bitcast i32 %x to <4 x i8> + %back = bitcast <4 x i8> %cast to i32 + ret i32 %back +} + +define i8 @extractelement.0(i32 %src) { +; LAZY-LABEL: define i8 @extractelement.0( +; LAZY-SAME: i32 [[SRC:%.*]]) { +; LAZY-NEXT: [[CAST:%.*]] = bitcast i32 [[SRC]] to <4 x i8> +; LAZY-NEXT: [[ELT:%.*]] = extractelement <4 x i8> [[CAST]], i64 3 +; LAZY-NEXT: ret i8 [[ELT]] +; +; AGGRESSIVE-LABEL: define i8 
@extractelement.0( +; AGGRESSIVE-SAME: i32 [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_SHIFT:%.*]] = lshr i32 [[SRC]], 24 +; AGGRESSIVE-NEXT: [[SRC_SHIFT_CAST:%.*]] = trunc i32 [[SRC_SHIFT]] to i8 +; AGGRESSIVE-NEXT: ret i8 [[SRC_SHIFT_CAST]] +; + %cast = bitcast i32 %src to <4 x i8> + %elt = extractelement <4 x i8> %cast, i64 3 + ret i8 %elt +} + +define i32 @insertelement.0(i8 %src) { +; LAZY-LABEL: define i32 @insertelement.0( +; LAZY-SAME: i8 [[SRC:%.*]]) { +; LAZY-NEXT: [[INSERT:%.*]] = insertelement <4 x i8> zeroinitializer, i8 [[SRC]], i64 3 +; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[INSERT]] to i32 +; LAZY-NEXT: ret i32 [[CAST]] +; +; AGGRESSIVE-LABEL: define i32 @insertelement.0( +; AGGRESSIVE-SAME: i8 [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = zext i8 [[SRC]] to i32 +; AGGRESSIVE-NEXT: [[SRC_SHIFT:%.*]] = shl i32 [[SRC_CAST]], 24 +; AGGRESSIVE-NEXT: ret i32 [[SRC_SHIFT]] +; + %insert = insertelement <4 x i8> zeroinitializer, i8 %src, i64 3 + %cast = bitcast <4 x i8> %insert to i32 + ret i32 %cast +} + +define i32 @insertelement.1(i8 %a, i8 %b) { +; LAZY-LABEL: define i32 @insertelement.1( +; LAZY-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; LAZY-NEXT: [[INSERT_A:%.*]] = insertelement <4 x i8> zeroinitializer, i8 [[A]], i64 3 +; LAZY-NEXT: [[INSERT_B:%.*]] = insertelement <4 x i8> [[INSERT_A]], i8 [[B]], i64 1 +; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[INSERT_B]] to i32 +; LAZY-NEXT: ret i32 [[CAST]] +; +; AGGRESSIVE-LABEL: define i32 @insertelement.1( +; AGGRESSIVE-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = zext i8 [[A]] to i32 +; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = zext i8 [[B]] to i32 +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i32 [[B_CAST]], 8 +; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = shl i32 [[A_CAST]], 24 +; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[B_SHIFT]], [[A_SHIFT]] +; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]] +; + %insert.a = insertelement <4 x i8> zeroinitializer, i8 %a, i64 3 + %insert.b = insertelement <4 x i8> %insert.a, i8 %b, i64 1 + %cast = bitcast <4 x i8> %insert.b to i32 + ret i32 %cast +} + +define i64 @shufflevector.0(i32 %a, i32 %b) { +; LAZY-LABEL: define i64 @shufflevector.0( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8> +; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8> +; LAZY-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> [[B_CAST]], <8 x i32> +; LAZY-NEXT: [[CAST:%.*]] = bitcast <8 x i8> [[SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[CAST]] +; +; AGGRESSIVE-LABEL: define i64 @shufflevector.0( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_CAST2:%.*]] = zext i32 [[B]] to i64 +; AGGRESSIVE-NEXT: [[A_CAST1:%.*]] = zext i32 [[A]] to i64 +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i64 [[B_CAST2]], 32 +; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i64 [[A_CAST1]], [[B_SHIFT]] +; AGGRESSIVE-NEXT: ret i64 [[CAST_MERGE]] +; + %a.cast = bitcast i32 %a to <4 x i8> + %b.cast = bitcast i32 %b to <4 x i8> + %shuffle = shufflevector <4 x i8> %a.cast, <4 x i8> %b.cast, <8 x i32> + %cast = bitcast <8 x i8> %shuffle to i64 + ret i64 %cast +} + +define i32 @shufflevector.1(i32 %a, i32 %b) { +; LAZY-LABEL: define i32 @shufflevector.1( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8> +; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8> +; LAZY-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> [[B_CAST]], <4 x i32> +; 
LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[SHUFFLE]] to i32 +; LAZY-NEXT: ret i32 [[CAST]] +; +; AGGRESSIVE-LABEL: define i32 @shufflevector.1( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_MASK:%.*]] = and i32 [[A]], 16711935 +; AGGRESSIVE-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16711936 +; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[A_MASK]], [[B_MASK]] +; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]] +; + %a.cast = bitcast i32 %a to <4 x i8> + %b.cast = bitcast i32 %b to <4 x i8> + %shuffle = shufflevector <4 x i8> %a.cast, <4 x i8> %b.cast, <4 x i32> + %cast = bitcast <4 x i8> %shuffle to i32 + ret i32 %cast +} + +define i32 @shufflevector.2(i32 %a, i64 %b) { +; LAZY-LABEL: define i32 @shufflevector.2( +; LAZY-SAME: i32 [[A:%.*]], i64 [[B:%.*]]) { +; LAZY-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8> +; LAZY-NEXT: [[B_CAST:%.*]] = bitcast i64 [[B]] to <8 x i8> +; LAZY-NEXT: [[SHUFFLE_0:%.*]] = shufflevector <8 x i8> [[B_CAST]], <8 x i8> poison, <4 x i32> +; LAZY-NEXT: [[SHUFFLE_1:%.*]] = shufflevector <4 x i8> [[SHUFFLE_0]], <4 x i8> [[A_CAST]], <4 x i32> +; LAZY-NEXT: [[CAST:%.*]] = bitcast <4 x i8> [[SHUFFLE_1]] to i32 +; LAZY-NEXT: ret i32 [[CAST]] +; +; AGGRESSIVE-LABEL: define i32 @shufflevector.2( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i64 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = lshr i64 [[B]], 48 +; AGGRESSIVE-NEXT: [[B_SHIFT_CAST:%.*]] = trunc i64 [[B_SHIFT]] to i32 +; AGGRESSIVE-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -65536 +; AGGRESSIVE-NEXT: [[CAST_MERGE:%.*]] = or disjoint i32 [[B_SHIFT_CAST]], [[A_MASK]] +; AGGRESSIVE-NEXT: ret i32 [[CAST_MERGE]] +; + %a.cast = bitcast i32 %a to <4 x i8> + %b.cast = bitcast i64 %b to <8 x i8> + %shuffle.0 = shufflevector <8 x i8> %b.cast, <8 x i8> poison, <4 x i32> + %shuffle.1 = shufflevector <4 x i8> %shuffle.0, <4 x i8> %a.cast, <4 x i32> + %cast = bitcast <4 x i8> %shuffle.1 to i32 + ret i32 %cast +} diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll new file mode 100644 index 0000000000000..fe08ce93719d0 --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll @@ -0,0 +1,302 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +define i16 @top_bytes(i32 %a, i32 %b) { +; LAZY-LABEL: define i16 @top_bytes( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16777216 +; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 16 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16777216 +; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 24 +; LAZY-NEXT: [[RES:%.*]] = or i32 [[A_LSHR]], [[B_LSHR]] +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i32 [[RES]] to i16 +; LAZY-NEXT: ret i16 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i16 @top_bytes( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = lshr i32 [[B]], 24 +; AGGRESSIVE-NEXT: [[B_SHIFT_CAST:%.*]] = trunc i32 [[B_SHIFT]] to i16 +; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = lshr i32 [[A]], 16 +; AGGRESSIVE-NEXT: [[A_SHIFT_CAST:%.*]] = trunc i32 [[A_SHIFT]] to i16 +; AGGRESSIVE-NEXT: [[A_MASK2:%.*]] = and i16 [[A_SHIFT_CAST]], -256 +; AGGRESSIVE-NEXT: [[TRUNC_MERGE:%.*]] = or disjoint i16 [[B_SHIFT_CAST]], [[A_MASK2]] +; AGGRESSIVE-NEXT: ret i16 [[TRUNC_MERGE]] +; + 
%a.mask = and i32 %a, u0xff000000 + %a.lshr = lshr i32 %a.mask, 16 + %b.mask = and i32 %b, u0xff000000 + %b.lshr = lshr i32 %b.mask, 24 + %res = or i32 %a.lshr, %b.lshr + %trunc = trunc i32 %res to i16 + ret i16 %trunc +} + +define i32 @bottom_bytes(i16 %a, i16 %b) { +; LAZY-LABEL: define i32 @bottom_bytes( +; LAZY-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i16 [[A]], 255 +; LAZY-NEXT: [[A_SHL:%.*]] = shl i16 [[A_MASK]], 8 +; LAZY-NEXT: [[B_MASK:%.*]] = and i16 [[B]], 255 +; LAZY-NEXT: [[RES:%.*]] = or i16 [[A_SHL]], [[B_MASK]] +; LAZY-NEXT: [[ZEXT:%.*]] = zext i16 [[RES]] to i32 +; LAZY-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], 16 +; LAZY-NEXT: ret i32 [[SHL]] +; +; AGGRESSIVE-LABEL: define i32 @bottom_bytes( +; AGGRESSIVE-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = zext i16 [[A]] to i32 +; AGGRESSIVE-NEXT: [[B_MASK2:%.*]] = and i16 [[B]], 255 +; AGGRESSIVE-NEXT: [[B_MASK_CAST:%.*]] = zext i16 [[B_MASK2]] to i32 +; AGGRESSIVE-NEXT: [[B_SHIFT:%.*]] = shl i32 [[B_MASK_CAST]], 16 +; AGGRESSIVE-NEXT: [[A_SHIFT:%.*]] = shl i32 [[A_CAST]], 24 +; AGGRESSIVE-NEXT: [[SHL_MERGE:%.*]] = or disjoint i32 [[B_SHIFT]], [[A_SHIFT]] +; AGGRESSIVE-NEXT: ret i32 [[SHL_MERGE]] +; + %a.mask = and i16 %a, u0x00ff + %a.shl = shl i16 %a.mask, 8 + %b.mask = and i16 %b, u0x00ff + %res = or i16 %a.shl, %b.mask + %zext = zext i16 %res to i32 + %shl = shl i32 %zext, 16 + ret i32 %shl +} + +define i32 @obtain_i32(i32 %from) { +; LAZY-LABEL: define i32 @obtain_i32( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: ret i32 [[FROM]] +; +; AGGRESSIVE-LABEL: define i32 @obtain_i32( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: ret i32 [[FROM]] +; + %get.0 = and i32 %from, 255 + %shr.1 = lshr i32 %from, 8 + %mask.1 = and i32 %shr.1, 255 + %get.1 = shl i32 %mask.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %shr.2 = lshr i32 %from, 16 + %mask.2 = and i32 %shr.2, 255 + %get.2 = shl i32 %mask.2, 16 + %shr.3 = lshr i32 %from, 24 + %mask.3 = and i32 %shr.3, 255 + %get.3 = shl i32 %mask.3, 24 + %out.2 = or i32 %get.2, %get.3 + + %out = or i32 %out.1, %out.2 + ret i32 %out +} + +;; u0xff00ffff = -16711681 +define i32 @obtain_i32_masked(i32 %from) { +; LAZY-LABEL: define i32 @obtain_i32_masked( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM]], -16711681 +; LAZY-NEXT: ret i32 [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define i32 @obtain_i32_masked( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM]], -16711681 +; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]] +; + %get.0 = and i32 %from, 255 + %shr.1 = lshr i32 %from, 8 + %mask.1 = and i32 %shr.1, 255 + %get.1 = shl i32 %mask.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %shr.3 = lshr i32 %from, 24 + %mask.3 = and i32 %shr.3, 255 + %get.3 = shl i32 %mask.3, 24 + %out.2 = or i32 %out.1, %get.3 + + ret i32 %out.2 +} + +define i64 @obtain_i64(i64 %from) { +; LAZY-LABEL: define i64 @obtain_i64( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_MASK:%.*]] = and i64 [[FROM]], 4294967295 +; LAZY-NEXT: ret i64 [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define i64 @obtain_i64( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i64 [[FROM]], 4294967295 +; AGGRESSIVE-NEXT: ret i64 [[FROM_MASK]] +; + %mask.0 = and i64 %from, 255 + %get.0 = shl i64 %mask.0, 0 + %shr.1 = lshr i64 %from, 8 + %mask.1 = and i64 %shr.1, 255 + %get.1 = shl i64 %mask.1, 8 + %out.1 = or i64 %get.0, %get.1 + + %shr.2 = lshr i64 %from, 16 + %mask.2 = and i64 
%shr.2, 255 + %get.2 = shl i64 %mask.2, 16 + %shr.3 = lshr i64 %from, 24 + %mask.3 = and i64 %shr.3, 255 + %get.3 = shl i64 %mask.3, 24 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @obtain_i64_shifted(i64 %from) { +; LAZY-LABEL: define i64 @obtain_i64_shifted( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM]], 32 +; LAZY-NEXT: ret i64 [[FROM_SHIFT]] +; +; AGGRESSIVE-LABEL: define i64 @obtain_i64_shifted( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM]], 32 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHIFT]] +; + %mask.0 = and i64 %from, 255 + %get.0 = shl i64 %mask.0, 32 + %shr.1 = lshr i64 %from, 8 + %mask.1 = and i64 %shr.1, 255 + %get.1 = shl i64 %mask.1, 40 + %out.1 = or i64 %get.0, %get.1 + + %shr.2 = lshr i64 %from, 16 + %mask.2 = and i64 %shr.2, 255 + %get.2 = shl i64 %mask.2, 48 + %shr.3 = lshr i64 %from, 24 + %mask.3 = and i64 %shr.3, 255 + %get.3 = shl i64 %mask.3, 56 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @obtain_i64_zext(i32 %from) { +; LAZY-LABEL: define i64 @obtain_i64_zext( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = zext i32 [[FROM]] to i64 +; LAZY-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM_CAST]], 32 +; LAZY-NEXT: ret i64 [[FROM_SHIFT]] +; +; AGGRESSIVE-LABEL: define i64 @obtain_i64_zext( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = zext i32 [[FROM]] to i64 +; AGGRESSIVE-NEXT: [[FROM_SHIFT:%.*]] = shl i64 [[FROM_CAST]], 32 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHIFT]] +; + %mask.0 = and i32 %from, 255 + %zext.0 = zext i32 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + %shr.1 = lshr i32 %from, 8 + %mask.1 = and i32 %shr.1, 255 + %zext.1 = zext i32 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 40 + %out.1 = or i64 %get.0, %get.1 + + %shr.2 = lshr i32 %from, 16 + %mask.2 = and i32 %shr.2, 255 + %zext.2 = zext i32 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 48 + %shr.3 = lshr i32 %from, 24 + %mask.3 = and i32 %shr.3, 255 + %zext.3 = zext i32 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 56 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @combine(i32 %bot, i32 %top) { +; LAZY-LABEL: define i64 @combine( +; LAZY-SAME: i32 [[BOT:%.*]], i32 [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = zext i32 [[TOP]] to i64 +; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; LAZY-NEXT: [[TOP_SHIFT:%.*]] = shl i64 [[TOP_CAST]], 32 +; LAZY-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHIFT]] +; LAZY-NEXT: ret i64 [[OUT_3_MERGE]] +; +; AGGRESSIVE-LABEL: define i64 @combine( +; AGGRESSIVE-SAME: i32 [[BOT:%.*]], i32 [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = zext i32 [[TOP]] to i64 +; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; AGGRESSIVE-NEXT: [[TOP_SHIFT:%.*]] = shl i64 [[TOP_CAST]], 32 +; AGGRESSIVE-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHIFT]] +; AGGRESSIVE-NEXT: ret i64 [[OUT_3_MERGE]] +; + %base = zext i32 %bot to i64 + + %mask.0 = and i32 %top, 255 + %zext.0 = zext i32 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + %out.0 = or i64 %base, %get.0 + + %shr.1 = lshr i32 %top, 8 + %mask.1 = and i32 %shr.1, 255 + %zext.1 = zext i32 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 40 + %out.1 = or i64 %out.0, %get.1 + + %shr.2 = lshr i32 %top, 16 + %mask.2 = and i32 %shr.2, 255 + %zext.2 = zext i32 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 
48 + %out.2 = or i64 %out.1, %get.2 + + %shr.3 = lshr i32 %top, 24 + %mask.3 = and i32 %shr.3, 255 + %zext.3 = zext i32 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 56 + %out.3 = or i64 %out.2, %get.3 + + ret i64 %out.3 +} + +;; u0x0000ff00 = 65280 +;; u0x00ff0000 = 16711680 +;; u0xff000000 = -16777216 +;; u0xff0000ff = -16776961 +define i32 @shuffle_elts(i32 %x) { +; LAZY-LABEL: define i32 @shuffle_elts( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_0:%.*]] = and i32 [[X]], 255 +; LAZY-NEXT: [[X_1:%.*]] = and i32 [[X]], 65280 +; LAZY-NEXT: [[X_2:%.*]] = and i32 [[X]], 16711680 +; LAZY-NEXT: [[X_3:%.*]] = and i32 [[X]], -16777216 +; LAZY-NEXT: [[SHL_1:%.*]] = shl i32 [[X_1]], 8 +; LAZY-NEXT: [[OUT_1:%.*]] = or i32 [[X_0]], [[SHL_1]] +; LAZY-NEXT: [[SHR_2:%.*]] = lshr i32 [[X_2]], 8 +; LAZY-NEXT: [[OUT_2:%.*]] = or i32 [[SHR_2]], [[X_3]] +; LAZY-NEXT: [[OUT:%.*]] = or i32 [[OUT_1]], [[OUT_2]] +; LAZY-NEXT: ret i32 [[OUT]] +; +; AGGRESSIVE-LABEL: define i32 @shuffle_elts( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_MASK:%.*]] = and i32 [[X]], -16776961 +; AGGRESSIVE-NEXT: [[X_SHIFT:%.*]] = lshr i32 [[X]], 8 +; AGGRESSIVE-NEXT: [[X_MASK2:%.*]] = and i32 [[X_SHIFT]], 65280 +; AGGRESSIVE-NEXT: [[X_MASK4:%.*]] = and i32 [[X]], 65280 +; AGGRESSIVE-NEXT: [[X_SHIFT6:%.*]] = shl i32 [[X_MASK4]], 8 +; AGGRESSIVE-NEXT: [[OUT_MERGE:%.*]] = or disjoint i32 [[X_MASK]], [[X_MASK2]] +; AGGRESSIVE-NEXT: [[OUT_MERGE8:%.*]] = or disjoint i32 [[OUT_MERGE]], [[X_SHIFT6]] +; AGGRESSIVE-NEXT: ret i32 [[OUT_MERGE8]] +; + %x.0 = and i32 %x, u0x000000ff + %x.1 = and i32 %x, u0x0000ff00 + %x.2 = and i32 %x, u0x00ff0000 + %x.3 = and i32 %x, u0xff000000 + + %shl.1 = shl i32 %x.1, 8 + %out.1 = or i32 %x.0, %shl.1 + + %shr.2 = lshr i32 %x.2, 8 + %out.2 = or i32 %shr.2, %x.3 + + %out = or i32 %out.1, %out.2 + + ret i32 %out +} diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll new file mode 100644 index 0000000000000..e4c1538826e0f --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll @@ -0,0 +1,393 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +define <2 x i8> @top_bytes(i32 %a, i32 %b) { +; LAZY-LABEL: define <2 x i8> @top_bytes( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -16777216 +; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 24 +; LAZY-NEXT: [[A_TRUNC:%.*]] = trunc i32 [[A_LSHR]] to i8 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -16777216 +; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 24 +; LAZY-NEXT: [[B_TRUNC:%.*]] = trunc i32 [[B_LSHR]] to i8 +; LAZY-NEXT: [[BYTES_0:%.*]] = insertelement <2 x i8> poison, i8 [[A_TRUNC]], i32 1 +; LAZY-NEXT: [[BYTES_1:%.*]] = insertelement <2 x i8> [[BYTES_0]], i8 [[B_TRUNC]], i32 0 +; LAZY-NEXT: ret <2 x i8> [[BYTES_1]] +; +; AGGRESSIVE-LABEL: define <2 x i8> @top_bytes( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <4 x i8> +; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <4 x i8> +; AGGRESSIVE-NEXT: [[B_EXTRACT:%.*]] = shufflevector <4 x i8> [[B_CAST]], <4 x i8> poison, <2 x i32> +; AGGRESSIVE-NEXT: [[B_SHUFFLE:%.*]] = shufflevector <2 x i8> [[B_EXTRACT]], <2 x i8> zeroinitializer, <2 x i32> +; 
AGGRESSIVE-NEXT: [[A_EXTRACT:%.*]] = shufflevector <4 x i8> [[A_CAST]], <4 x i8> poison, <2 x i32> +; AGGRESSIVE-NEXT: [[A_SHUFFLE:%.*]] = shufflevector <2 x i8> [[A_EXTRACT]], <2 x i8> [[B_SHUFFLE]], <2 x i32> +; AGGRESSIVE-NEXT: ret <2 x i8> [[A_SHUFFLE]] +; + %a.mask = and i32 %a, u0xff000000 + %a.lshr = lshr i32 %a.mask, 24 + %a.trunc = trunc i32 %a.lshr to i8 + %b.mask = and i32 %b, u0xff000000 + %b.lshr = lshr i32 %b.mask, 24 + %b.trunc = trunc i32 %b.lshr to i8 + %bytes.0 = insertelement <2 x i8> poison, i8 %a.trunc, i32 1 + %bytes.1 = insertelement <2 x i8> %bytes.0, i8 %b.trunc, i32 0 + ret <2 x i8> %bytes.1 +} + +define <2 x i16> @top_bytes.i16(i32 %a, i32 %b) { +; LAZY-LABEL: define <2 x i16> @top_bytes.i16( +; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; LAZY-NEXT: [[A_MASK:%.*]] = and i32 [[A]], -65536 +; LAZY-NEXT: [[A_LSHR:%.*]] = lshr i32 [[A_MASK]], 16 +; LAZY-NEXT: [[A_TRUNC:%.*]] = trunc i32 [[A_LSHR]] to i16 +; LAZY-NEXT: [[B_MASK:%.*]] = and i32 [[B]], -65536 +; LAZY-NEXT: [[B_LSHR:%.*]] = lshr i32 [[B_MASK]], 16 +; LAZY-NEXT: [[B_TRUNC:%.*]] = trunc i32 [[B_LSHR]] to i16 +; LAZY-NEXT: [[BYTES_0:%.*]] = insertelement <2 x i16> poison, i16 [[A_TRUNC]], i32 1 +; LAZY-NEXT: [[BYTES_1:%.*]] = insertelement <2 x i16> [[BYTES_0]], i16 [[B_TRUNC]], i32 0 +; LAZY-NEXT: ret <2 x i16> [[BYTES_1]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @top_bytes.i16( +; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; AGGRESSIVE-NEXT: [[A_CAST:%.*]] = bitcast i32 [[A]] to <2 x i16> +; AGGRESSIVE-NEXT: [[B_CAST:%.*]] = bitcast i32 [[B]] to <2 x i16> +; AGGRESSIVE-NEXT: [[B_SHUFFLE:%.*]] = shufflevector <2 x i16> [[B_CAST]], <2 x i16> zeroinitializer, <2 x i32> +; AGGRESSIVE-NEXT: [[A_SHUFFLE:%.*]] = shufflevector <2 x i16> [[A_CAST]], <2 x i16> [[B_SHUFFLE]], <2 x i32> +; AGGRESSIVE-NEXT: ret <2 x i16> [[A_SHUFFLE]] +; + %a.mask = and i32 %a, u0xffff0000 + %a.lshr = lshr i32 %a.mask, 16 + %a.trunc = trunc i32 %a.lshr to i16 + %b.mask = and i32 %b, u0xffff0000 + %b.lshr = lshr i32 %b.mask, 16 + %b.trunc = trunc i32 %b.lshr to i16 + %bytes.0 = insertelement <2 x i16> poison, i16 %a.trunc, i32 1 + %bytes.1 = insertelement <2 x i16> %bytes.0, i16 %b.trunc, i32 0 + ret <2 x i16> %bytes.1 +} + +define <4 x i8> @obtain_i32(i32 %from) { +; LAZY-LABEL: define <4 x i8> @obtain_i32( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8> +; LAZY-NEXT: ret <4 x i8> [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @obtain_i32( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_CAST]] +; + %get.0 = trunc i32 %from to i8 + + %shr.1 = lshr i32 %from, 8 + %get.1 = trunc i32 %shr.1 to i8 + + %shr.2 = lshr i32 %from, 16 + %get.2 = trunc i32 %shr.2 to i8 + + %shr.3 = lshr i32 %from, 24 + %get.3 = trunc i32 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> poison, i8 %get.0, i32 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %get.1, i32 1 + %build.2 = insertelement <4 x i8> %build.1, i8 %get.2, i32 2 + %build.3 = insertelement <4 x i8> %build.2, i8 %get.3, i32 3 + ret <4 x i8> %build.3 +} + +define <2 x i16> @obtain_i32.i16(i32 %from) { +; LAZY-LABEL: define <2 x i16> @obtain_i32.i16( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16> +; LAZY-NEXT: ret <2 x i16> [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @obtain_i32.i16( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: 
[[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_CAST]] +; + %get.0 = trunc i32 %from to i16 + + %shr.1 = lshr i32 %from, 16 + %get.1 = trunc i32 %shr.1 to i16 + + %build.0 = insertelement <2 x i16> poison, i16 %get.0, i32 0 + %build.1 = insertelement <2 x i16> %build.0, i16 %get.1, i32 1 + ret <2 x i16> %build.1 +} + +define <4 x i8> @obtain_i32_masked(i32 %from) { +; LAZY-LABEL: define <4 x i8> @obtain_i32_masked( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @obtain_i32_masked( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_SHUFFLE]] +; + %get.0 = trunc i32 %from to i8 + + %shr.1 = lshr i32 %from, 8 + %get.1 = trunc i32 %shr.1 to i8 + + %shr.3 = lshr i32 %from, 24 + %get.3 = trunc i32 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> , i8 %get.0, i32 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %get.1, i32 1 + %build.3 = insertelement <4 x i8> %build.1, i8 %get.3, i32 3 + ret <4 x i8> %build.3 +} + +define <2 x i16> @obtain_i32_masked.i16(i32 %from) { +; LAZY-LABEL: define <2 x i16> @obtain_i32_masked.i16( +; LAZY-SAME: i32 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16> +; LAZY-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], +; LAZY-NEXT: ret <2 x i16> [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @obtain_i32_masked.i16( +; AGGRESSIVE-SAME: i32 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i32 [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], +; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_MASK]] +; + %get.0 = trunc i32 %from to i16 + + %shr.1 = lshr i32 %from, 16 + %trunc.1 = trunc i32 %shr.1 to i16 + %get.1 = and i16 %trunc.1, u0xff00 + + %build.0 = insertelement <2 x i16> poison, i16 %get.0, i32 0 + %build.1 = insertelement <2 x i16> %build.0, i16 %get.1, i32 1 + ret <2 x i16> %build.1 +} + +define <8 x i8> @obtain_i64(i64 %from) { +; LAZY-LABEL: define <8 x i8> @obtain_i64( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @obtain_i64( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; + %get.0 = trunc i64 %from to i8 + + %shr.1 = lshr i64 %from, 8 + %get.1 = trunc i64 %shr.1 to i8 + + %shr.2 = lshr i64 %from, 16 + %get.2 = trunc i64 %shr.2 to i8 + + %shr.3 = lshr i64 %from, 24 + %get.3 = trunc i64 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> zeroinitializer, i8 %get.0, i32 0 + %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 1 + %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 2 + %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 3 + ret <8 x i8> %build.3 +} + 
+define <4 x i16> @obtain_i64.i16(i64 %from) { +; LAZY-LABEL: define <4 x i16> @obtain_i64.i16( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> @obtain_i64.i16( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; + %get.0 = trunc i64 %from to i16 + + %shr.1 = lshr i64 %from, 16 + %get.1 = trunc i64 %shr.1 to i16 + + %build.0 = insertelement <4 x i16> zeroinitializer, i16 %get.0, i32 0 + %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 1 + ret <4 x i16> %build.1 +} + +define <8 x i8> @obtain_i64_shifted(i64 %from) { +; LAZY-LABEL: define <8 x i8> @obtain_i64_shifted( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @obtain_i64_shifted( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <8 x i8> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_CAST]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; + %get.0 = trunc i64 %from to i8 + + %shr.1 = lshr i64 %from, 8 + %get.1 = trunc i64 %shr.1 to i8 + + %shr.2 = lshr i64 %from, 16 + %get.2 = trunc i64 %shr.2 to i8 + + %shr.3 = lshr i64 %from, 24 + %get.3 = trunc i64 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> zeroinitializer, i8 %get.0, i32 4 + %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 5 + %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 6 + %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 7 + ret <8 x i8> %build.3 +} + +define <4 x i16> @obtain_i64_shifted.i16(i64 %from) { +; LAZY-LABEL: define <4 x i16> @obtain_i64_shifted.i16( +; LAZY-SAME: i64 [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> @obtain_i64_shifted.i16( +; AGGRESSIVE-SAME: i64 [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast i64 [[FROM]] to <4 x i16> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_CAST]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; + %get.0 = trunc i64 %from to i16 + + %shr.1 = lshr i64 %from, 16 + %get.1 = trunc i64 %shr.1 to i16 + + %build.0 = insertelement <4 x i16> zeroinitializer, i16 %get.0, i32 2 + %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 3 + ret <4 x i16> %build.1 +} + +define <8 x i8> @combine(<4 x i8> %bot, i32 %top) { +; LAZY-LABEL: define <8 x i8> @combine( +; LAZY-SAME: <4 x i8> [[BOT:%.*]], i32 [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <4 x i8> +; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x 
i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[TOP_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @combine( +; AGGRESSIVE-SAME: <4 x i8> [[BOT:%.*]], i32 [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <4 x i8> +; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[TOP_SHUFFLE]] +; + %base = shufflevector <4 x i8> %bot, <4 x i8> poison, <8 x i32> + + %get.0 = trunc i32 %top to i8 + + %shr.1 = lshr i32 %top, 8 + %get.1 = trunc i32 %shr.1 to i8 + + %shr.2 = lshr i32 %top, 16 + %get.2 = trunc i32 %shr.2 to i8 + + %shr.3 = lshr i32 %top, 24 + %get.3 = trunc i32 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> %base, i8 %get.0, i32 4 + %build.1 = insertelement <8 x i8> %build.0, i8 %get.1, i32 5 + %build.2 = insertelement <8 x i8> %build.1, i8 %get.2, i32 6 + %build.3 = insertelement <8 x i8> %build.2, i8 %get.3, i32 7 + + ret <8 x i8> %build.3 +} + +define <4 x i16> @combine.i16(<2 x i16> %bot, i32 %top) { +; LAZY-LABEL: define <4 x i16> @combine.i16( +; LAZY-SAME: <2 x i16> [[BOT:%.*]], i32 [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <2 x i16> +; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[TOP_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> @combine.i16( +; AGGRESSIVE-SAME: <2 x i16> [[BOT:%.*]], i32 [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast i32 [[TOP]] to <2 x i16> +; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[TOP_SHUFFLE]] +; + %base = shufflevector <2 x i16> %bot, <2 x i16> poison, <4 x i32> + + %get.0 = trunc i32 %top to i16 + + %shr.1 = lshr i32 %top, 16 + %get.1 = trunc i32 %shr.1 to i16 + + %build.0 = insertelement <4 x i16> %base, i16 %get.0, i32 2 + %build.1 = insertelement <4 x i16> %build.0, i16 %get.1, i32 3 + + ret <4 x i16> %build.1 +} + +define <4 x i8> @shuffle_elts(i32 %x) { +; LAZY-LABEL: define <4 x i8> @shuffle_elts( +; LAZY-SAME: i32 [[X:%.*]]) { +; LAZY-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8 +; LAZY-NEXT: [[SHR_1:%.*]] = lshr i32 [[X]], 8 +; LAZY-NEXT: [[X_1:%.*]] = trunc i32 [[SHR_1]] to 
i8 +; LAZY-NEXT: [[SHR_2:%.*]] = lshr i32 [[X]], 16 +; LAZY-NEXT: [[X_2:%.*]] = trunc i32 [[SHR_2]] to i8 +; LAZY-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24 +; LAZY-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8 +; LAZY-NEXT: [[BUILD_0:%.*]] = insertelement <4 x i8> poison, i8 [[X_0]], i32 0 +; LAZY-NEXT: [[BUILD_1:%.*]] = insertelement <4 x i8> [[BUILD_0]], i8 [[X_1]], i32 2 +; LAZY-NEXT: [[BUILD_2:%.*]] = insertelement <4 x i8> [[BUILD_1]], i8 [[X_2]], i32 1 +; LAZY-NEXT: [[X_SHUFFLE4:%.*]] = insertelement <4 x i8> [[BUILD_2]], i8 [[X_3]], i32 3 +; LAZY-NEXT: ret <4 x i8> [[X_SHUFFLE4]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @shuffle_elts( +; AGGRESSIVE-SAME: i32 [[X:%.*]]) { +; AGGRESSIVE-NEXT: [[X_CAST:%.*]] = bitcast i32 [[X]] to <4 x i8> +; AGGRESSIVE-NEXT: [[X_SHUFFLE:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[X_SHUFFLE2:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> [[X_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: [[X_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[X_CAST]], <4 x i8> [[X_SHUFFLE2]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[X_SHUFFLE4]] +; + %x.0 = trunc i32 %x to i8 + + %shr.1 = lshr i32 %x, 8 + %x.1 = trunc i32 %shr.1 to i8 + + %shr.2 = lshr i32 %x, 16 + %x.2 = trunc i32 %shr.2 to i8 + + %shr.3 = lshr i32 %x, 24 + %x.3 = trunc i32 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> poison, i8 %x.0, i32 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %x.1, i32 2 + %build.2 = insertelement <4 x i8> %build.1, i8 %x.2, i32 1 + %build.3 = insertelement <4 x i8> %build.2, i8 %x.3, i32 3 + + ret <4 x i8> %build.3 +} diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll new file mode 100644 index 0000000000000..22ee7fcf5b4c6 --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll @@ -0,0 +1,480 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +define i32 @extract_i32(<4 x i8> %from) { +; LAZY-LABEL: define i32 @extract_i32( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32 +; LAZY-NEXT: ret i32 [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[FROM_CAST]] +; + %mask.0 = extractelement <4 x i8> %from, i64 0 + %get.0 = zext i8 %mask.0 to i32 + + %mask.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %mask.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %mask.2 to i32 + %get.2 = shl i32 %zext.2, 16 + + %mask.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %mask.3 to i32 + %get.3 = shl i32 %zext.3, 24 + %out.2 = or i32 %get.2, %get.3 + + %out = or i32 %out.1, %out.2 + ret i32 %out +} + +define i32 @extract_i32.i16(<2 x i16> %from) { +; LAZY-LABEL: define i32 @extract_i32.i16( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32 +; LAZY-NEXT: ret i32 [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32.i16( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x 
i16> [[FROM]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[FROM_CAST]] +; + %mask.0 = extractelement <2 x i16> %from, i64 0 + %get.0 = zext i16 %mask.0 to i32 + + %mask.1 = extractelement <2 x i16> %from, i64 1 + %zext.1 = zext i16 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 16 + + %out = or i32 %get.0, %get.1 + ret i32 %out +} + +define i32 @extract_i32_lower(<8 x i8> %from) { +; LAZY-LABEL: define i32 @extract_i32_lower( +; LAZY-SAME: <8 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM]], <8 x i8> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i8> [[FROM_SHUFFLE]] to i32 +; LAZY-NEXT: ret i32 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32_lower( +; AGGRESSIVE-SAME: <8 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM]], <8 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i8> [[FROM_SHUFFLE]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <8 x i8> %from, i64 4 + %get.0 = zext i8 %mask.0 to i32 + + %mask.1 = extractelement <8 x i8> %from, i64 5 + %zext.1 = zext i8 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %mask.2 = extractelement <8 x i8> %from, i64 6 + %zext.2 = zext i8 %mask.2 to i32 + %get.2 = shl i32 %zext.2, 16 + + %mask.3 = extractelement <8 x i8> %from, i64 7 + %zext.3 = zext i8 %mask.3 to i32 + %get.3 = shl i32 %zext.3, 24 + %out.2 = or i32 %get.2, %get.3 + + %out = or i32 %out.1, %out.2 + ret i32 %out +} + +define i32 @extract_i32_lower.i16(<4 x i16> %from) { +; LAZY-LABEL: define i32 @extract_i32_lower.i16( +; LAZY-SAME: <4 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM]], <4 x i16> zeroinitializer, <2 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <2 x i16> [[FROM_SHUFFLE]] to i32 +; LAZY-NEXT: ret i32 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32_lower.i16( +; AGGRESSIVE-SAME: <4 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM]], <4 x i16> zeroinitializer, <2 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <2 x i16> [[FROM_SHUFFLE]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <4 x i16> %from, i64 2 + %get.0 = zext i16 %mask.0 to i32 + + %mask.1 = extractelement <4 x i16> %from, i64 3 + %zext.1 = zext i16 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 16 + + %out = or i32 %get.0, %get.1 + ret i32 %out +} + +;; u0xff00ffff = -16711681 +define i32 @extract_i32_masked(<4 x i8> %from) { +; LAZY-LABEL: define i32 @extract_i32_masked( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32 +; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681 +; LAZY-NEXT: ret i32 [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32_masked( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to i32 +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681 +; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]] +; + %mask.0 = extractelement <4 x i8> %from, i64 0 + %get.0 = zext i8 %mask.0 to i32 + + %mask.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %mask.1 to i32 + %get.1 = shl i32 %zext.1, 8 + %out.1 = or i32 %get.0, %get.1 + + %mask.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %mask.3 to i32 + %get.3 = shl 
i32 %zext.3, 24 + %out.2 = or i32 %out.1, %get.3 + + ret i32 %out.2 +} + +;; u0xff00ffff = -16711681 +define i32 @extract_i32_masked.i16(<2 x i16> %from) { +; LAZY-LABEL: define i32 @extract_i32_masked.i16( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32 +; LAZY-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681 +; LAZY-NEXT: ret i32 [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define i32 @extract_i32_masked.i16( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to i32 +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and i32 [[FROM_CAST]], -16711681 +; AGGRESSIVE-NEXT: ret i32 [[FROM_MASK]] +; + %mask.0 = extractelement <2 x i16> %from, i64 0 + %get.0 = zext i16 %mask.0 to i32 + + %mask.1 = extractelement <2 x i16> %from, i64 1 + %mask.1.1 = and i16 %mask.1, u0xff00 + %zext.1 = zext i16 %mask.1.1 to i32 + %get.1 = shl i32 %zext.1, 16 + + %out = or i32 %get.0, %get.1 + ret i32 %out +} + +define i64 @extract_i64(<4 x i8> %from) { +; LAZY-LABEL: define i64 @extract_i64( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i64 @extract_i64( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <4 x i8> %from, i64 0 + %get.0 = zext i8 %mask.0 to i64 + + %mask.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 8 + %out.1 = or i64 %get.0, %get.1 + + %mask.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 16 + + %mask.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 24 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @extract_i64.i16(<2 x i16> %from) { +; LAZY-LABEL: define i64 @extract_i64.i16( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i64 @extract_i64.i16( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <2 x i16> %from, i64 0 + %get.0 = zext i16 %mask.0 to i64 + + %mask.1 = extractelement <2 x i16> %from, i64 1 + %zext.1 = zext i16 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 16 + + %out = or i64 %get.0, %get.1 + ret i64 %out +} + +define i64 @extract_i64_shifted(<4 x i8> %from) { +; LAZY-LABEL: define i64 @extract_i64_shifted( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> 
[[FROM_SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i64 @extract_i64_shifted( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM]], <4 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[FROM_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <4 x i8> %from, i64 0 + %zext.0 = zext i8 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + + %mask.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 40 + %out.1 = or i64 %get.0, %get.1 + + %mask.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 48 + + %mask.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 56 + %out.2 = or i64 %get.2, %get.3 + + %out = or i64 %out.1, %out.2 + ret i64 %out +} + +define i64 @extract_i64_shifted.i16(<2 x i16> %from) { +; LAZY-LABEL: define i64 @extract_i64_shifted.i16( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64 +; LAZY-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; +; AGGRESSIVE-LABEL: define i64 @extract_i64_shifted.i16( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <2 x i16> [[FROM]], <2 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[FROM_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: ret i64 [[FROM_SHUFFLE_CAST]] +; + %mask.0 = extractelement <2 x i16> %from, i64 0 + %zext.0 = zext i16 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + + %mask.1 = extractelement <2 x i16> %from, i64 1 + %zext.1 = zext i16 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 48 + + %out = or i64 %get.0, %get.1 + ret i64 %out +} + +define i64 @extract_combine(i32 %bot, <4 x i8> %top) { +; LAZY-LABEL: define i64 @extract_combine( +; LAZY-SAME: i32 [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) { +; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i8> [[TOP]], <4 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[TOP_SHUFFLE]] to i64 +; LAZY-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]] +; LAZY-NEXT: ret i64 [[OUT_3_MERGE]] +; +; AGGRESSIVE-LABEL: define i64 @extract_combine( +; AGGRESSIVE-SAME: i32 [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i8> [[TOP]], <4 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <8 x i8> [[TOP_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: [[OUT_3_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]] +; AGGRESSIVE-NEXT: ret i64 [[OUT_3_MERGE]] +; + %base = zext i32 %bot to i64 + + %mask.0 = extractelement <4 x i8> %top, i64 0 + %zext.0 = zext i8 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + %out.0 = or i64 %base, %get.0 + + %mask.1 = extractelement <4 x i8> %top, i64 1 + %zext.1 = zext i8 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 40 + %out.1 = or i64 %out.0, %get.1 + + %mask.2 = extractelement <4 x i8> %top, i64 2 + %zext.2 = zext i8 %mask.2 to i64 + %get.2 = shl i64 %zext.2, 48 + %out.2 = or i64 %out.1, 
%get.2 + + %mask.3 = extractelement <4 x i8> %top, i64 3 + %zext.3 = zext i8 %mask.3 to i64 + %get.3 = shl i64 %zext.3, 56 + %out.3 = or i64 %out.2, %get.3 + + ret i64 %out.3 +} + +define i64 @extract_combine.i16(i32 %bot, <2 x i16> %top) { +; LAZY-LABEL: define i64 @extract_combine.i16( +; LAZY-SAME: i32 [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) { +; LAZY-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <2 x i16> [[TOP]], <2 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[TOP_SHUFFLE]] to i64 +; LAZY-NEXT: [[OUT_1_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]] +; LAZY-NEXT: ret i64 [[OUT_1_MERGE]] +; +; AGGRESSIVE-LABEL: define i64 @extract_combine.i16( +; AGGRESSIVE-SAME: i32 [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[BOT_CAST:%.*]] = zext i32 [[BOT]] to i64 +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <2 x i16> [[TOP]], <2 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE_CAST:%.*]] = bitcast <4 x i16> [[TOP_SHUFFLE]] to i64 +; AGGRESSIVE-NEXT: [[OUT_1_MERGE:%.*]] = or disjoint i64 [[BOT_CAST]], [[TOP_SHUFFLE_CAST]] +; AGGRESSIVE-NEXT: ret i64 [[OUT_1_MERGE]] +; + %base = zext i32 %bot to i64 + + %mask.0 = extractelement <2 x i16> %top, i64 0 + %zext.0 = zext i16 %mask.0 to i64 + %get.0 = shl i64 %zext.0, 32 + %out.0 = or i64 %base, %get.0 + + %mask.1 = extractelement <2 x i16> %top, i64 1 + %zext.1 = zext i16 %mask.1 to i64 + %get.1 = shl i64 %zext.1, 48 + %out.1 = or i64 %out.0, %get.1 + + ret i64 %out.1 +} + +define i32 @extract_bigelt.0(<4 x i64> %src) { +; LAZY-LABEL: define i32 @extract_bigelt.0( +; LAZY-SAME: <4 x i64> [[SRC:%.*]]) { +; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SRC_0]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @extract_bigelt.0( +; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <4 x i64> [[SRC]] to <8 x i32> +; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = extractelement <8 x i32> [[SRC_CAST]], i32 6 +; AGGRESSIVE-NEXT: ret i32 [[SRC_EXTRACT]] +; + %src.0 = extractelement <4 x i64> %src, i64 3 + %trunc = trunc i64 %src.0 to i32 + ret i32 %trunc +} + +define i32 @extract_bigelt.1(<4 x i64> %src) { +; LAZY-LABEL: define i32 @extract_bigelt.1( +; LAZY-SAME: <4 x i64> [[SRC:%.*]]) { +; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3 +; LAZY-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 32 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @extract_bigelt.1( +; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_CAST:%.*]] = bitcast <4 x i64> [[SRC]] to <8 x i32> +; AGGRESSIVE-NEXT: [[SRC_EXTRACT:%.*]] = extractelement <8 x i32> [[SRC_CAST]], i32 7 +; AGGRESSIVE-NEXT: ret i32 [[SRC_EXTRACT]] +; + %src.0 = extractelement <4 x i64> %src, i64 3 + %shr = lshr i64 %src.0, 32 + %trunc = trunc i64 %shr to i32 + ret i32 %trunc +} + +;; Nothing happens because the shift amount is too small. 
+define i32 @extract_bigelt.2(<4 x i64> %src) { +; LAZY-LABEL: define i32 @extract_bigelt.2( +; LAZY-SAME: <4 x i64> [[SRC:%.*]]) { +; LAZY-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3 +; LAZY-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 16 +; LAZY-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32 +; LAZY-NEXT: ret i32 [[TRUNC]] +; +; AGGRESSIVE-LABEL: define i32 @extract_bigelt.2( +; AGGRESSIVE-SAME: <4 x i64> [[SRC:%.*]]) { +; AGGRESSIVE-NEXT: [[SRC_0:%.*]] = extractelement <4 x i64> [[SRC]], i64 3 +; AGGRESSIVE-NEXT: [[SHR:%.*]] = lshr i64 [[SRC_0]], 16 +; AGGRESSIVE-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHR]] to i32 +; AGGRESSIVE-NEXT: ret i32 [[TRUNC]] +; + %src.0 = extractelement <4 x i64> %src, i64 3 + %shr = lshr i64 %src.0, 16 + %trunc = trunc i64 %shr to i32 + ret i32 %trunc +} + +;; u0x0000ff00 = 65280 +;; u0x00ff0000 = 16711680 +;; u0xff0000ff = -16776961 +define i32 @shuffle_elts(<4 x i8> %vec) { +; LAZY-LABEL: define i32 @shuffle_elts( +; LAZY-SAME: <4 x i8> [[VEC:%.*]]) { +; LAZY-NEXT: [[VEC_0:%.*]] = extractelement <4 x i8> [[VEC]], i32 0 +; LAZY-NEXT: [[VEC_1:%.*]] = extractelement <4 x i8> [[VEC]], i32 1 +; LAZY-NEXT: [[VEC_2:%.*]] = extractelement <4 x i8> [[VEC]], i32 2 +; LAZY-NEXT: [[VEC_3:%.*]] = extractelement <4 x i8> [[VEC]], i32 3 +; LAZY-NEXT: [[ZEXT_0:%.*]] = zext i8 [[VEC_0]] to i32 +; LAZY-NEXT: [[ZEXT_1:%.*]] = zext i8 [[VEC_1]] to i32 +; LAZY-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16 +; LAZY-NEXT: [[OUT_1:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]] +; LAZY-NEXT: [[ZEXT_2:%.*]] = zext i8 [[VEC_2]] to i32 +; LAZY-NEXT: [[SHL_2:%.*]] = shl i32 [[ZEXT_2]], 8 +; LAZY-NEXT: [[ZEXT_3:%.*]] = zext i8 [[VEC_3]] to i32 +; LAZY-NEXT: [[SHL_3:%.*]] = shl i32 [[ZEXT_3]], 24 +; LAZY-NEXT: [[OUT_2:%.*]] = or i32 [[SHL_2]], [[SHL_3]] +; LAZY-NEXT: [[OUT:%.*]] = or i32 [[OUT_1]], [[OUT_2]] +; LAZY-NEXT: ret i32 [[OUT]] +; +; AGGRESSIVE-LABEL: define i32 @shuffle_elts( +; AGGRESSIVE-SAME: <4 x i8> [[VEC:%.*]]) { +; AGGRESSIVE-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x i8> [[VEC]] to i32 +; AGGRESSIVE-NEXT: [[VEC_MASK:%.*]] = and i32 [[VEC_CAST]], -16776961 +; AGGRESSIVE-NEXT: [[VEC_SHIFT:%.*]] = lshr i32 [[VEC_CAST]], 8 +; AGGRESSIVE-NEXT: [[VEC_MASK2:%.*]] = and i32 [[VEC_SHIFT]], 65280 +; AGGRESSIVE-NEXT: [[VEC_SHIFT4:%.*]] = shl i32 [[VEC_CAST]], 8 +; AGGRESSIVE-NEXT: [[VEC_MASK6:%.*]] = and i32 [[VEC_SHIFT4]], 16711680 +; AGGRESSIVE-NEXT: [[OUT_MERGE:%.*]] = or disjoint i32 [[VEC_MASK]], [[VEC_MASK2]] +; AGGRESSIVE-NEXT: [[OUT_MERGE8:%.*]] = or disjoint i32 [[OUT_MERGE]], [[VEC_MASK6]] +; AGGRESSIVE-NEXT: ret i32 [[OUT_MERGE8]] +; + %vec.0 = extractelement <4 x i8> %vec, i32 0 + %vec.1 = extractelement <4 x i8> %vec, i32 1 + %vec.2 = extractelement <4 x i8> %vec, i32 2 + %vec.3 = extractelement <4 x i8> %vec, i32 3 + + %zext.0 = zext i8 %vec.0 to i32 + + %zext.1 = zext i8 %vec.1 to i32 + %shl.1 = shl i32 %zext.1, 16 + %out.1 = or i32 %zext.0, %shl.1 + + %zext.2 = zext i8 %vec.2 to i32 + %shl.2 = shl i32 %zext.2, 8 + + %zext.3 = zext i8 %vec.3 to i32 + %shl.3 = shl i32 %zext.3, 24 + %out.2 = or i32 %shl.2, %shl.3 + + %out = or i32 %out.1, %out.2 + + ret i32 %out +} diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll new file mode 100644 index 0000000000000..5baefc7fb6cda --- /dev/null +++ b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll @@ -0,0 +1,294 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=packedintcombine %s | 
FileCheck %s --check-prefix=LAZY +; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE + +define <4 x i8> @obtain_v4i8(<2 x i16> %from) { +; LAZY-LABEL: define <4 x i8> @obtain_v4i8( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; LAZY-NEXT: ret <4 x i8> [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @obtain_v4i8( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_CAST]] +; + %from.0 = extractelement <2 x i16> %from, i64 0 + %mask.0 = trunc i16 %from.0 to i8 + %shr.1 = lshr i16 %from.0, 8 + %mask.1 = trunc i16 %shr.1 to i8 + + %from.1 = extractelement <2 x i16> %from, i64 1 + %mask.2 = trunc i16 %from.1 to i8 + %shr.3 = lshr i16 %from.1, 8 + %mask.3 = trunc i16 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> poison, i8 %mask.0, i64 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %mask.1, i64 1 + %build.2 = insertelement <4 x i8> %build.1, i8 %mask.2, i64 2 + %build.3 = insertelement <4 x i8> %build.2, i8 %mask.3, i64 3 + + ret <4 x i8> %build.3 +} + +define <2 x i16> @obtain_v2i16(<4 x i8> %from) { +; LAZY-LABEL: define <2 x i16> @obtain_v2i16( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; LAZY-NEXT: ret <2 x i16> [[FROM_CAST]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @obtain_v2i16( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_CAST]] +; + %from.0 = extractelement <4 x i8> %from, i64 0 + %zext.0 = zext i8 %from.0 to i16 + + %from.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %from.1 to i16 + %shl.1 = shl i16 %zext.1, 8 + %out.1 = or i16 %zext.0, %shl.1 + + %from.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %from.2 to i16 + + %from.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %from.3 to i16 + %shl.3 = shl i16 %zext.3, 8 + %out.2 = or i16 %zext.2, %shl.3 + + %build.0 = insertelement <2 x i16> poison, i16 %out.1, i64 0 + %build.1 = insertelement <2 x i16> %build.0, i16 %out.2, i64 1 + + ret <2 x i16> %build.1 +} + +define <4 x i8> @obtain_v4i8_masked(<2 x i16> %from) { +; LAZY-LABEL: define <4 x i8> @obtain_v4i8_masked( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @obtain_v4i8_masked( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[FROM_SHUFFLE]] +; + %from.0 = extractelement <2 x i16> %from, i64 0 + %mask.0 = trunc i16 %from.0 to i8 + %shr.1 = lshr i16 %from.0, 8 + %mask.1 = trunc i16 %shr.1 to i8 + + %from.1 = extractelement <2 x i16> %from, i64 1 + %shr.3 = lshr i16 %from.1, 8 + %mask.3 = trunc i16 %shr.3 to i8 + + %build.0 = insertelement <4 x i8> , i8 %mask.0, i64 0 + %build.1 = insertelement <4 x i8> %build.0, i8 %mask.1, i64 1 + %build.3 = insertelement <4 x i8> %build.1, i8 %mask.3, i64 3 + + ret <4 x i8> %build.3 +} + 
+define <2 x i16> @obtain_v2i16_masked(<4 x i8> %from) { +; LAZY-LABEL: define <2 x i16> @obtain_v2i16_masked( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; LAZY-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], +; LAZY-NEXT: ret <2 x i16> [[FROM_MASK]] +; +; AGGRESSIVE-LABEL: define <2 x i16> @obtain_v2i16_masked( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: [[FROM_MASK:%.*]] = and <2 x i16> [[FROM_CAST]], +; AGGRESSIVE-NEXT: ret <2 x i16> [[FROM_MASK]] +; + %from.0 = extractelement <4 x i8> %from, i64 0 + %zext.0 = zext i8 %from.0 to i16 + + %from.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %from.1 to i16 + %shl.1 = shl i16 %zext.1, 8 + %out.1 = or i16 %zext.0, %shl.1 + + %from.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %from.3 to i16 + %shl.3 = shl i16 %zext.3, 8 + + %build.0 = insertelement <2 x i16> poison, i16 %out.1, i64 0 + %build.1 = insertelement <2 x i16> %build.0, i16 %shl.3, i64 1 + + ret <2 x i16> %build.1 +} + +define <8 x i8> @obtain_v4i8_shifted(<2 x i16> %from) { +; LAZY-LABEL: define <8 x i8> @obtain_v4i8_shifted( +; LAZY-SAME: <2 x i16> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; LAZY-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @obtain_v4i8_shifted( +; AGGRESSIVE-SAME: <2 x i16> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <2 x i16> [[FROM]] to <4 x i8> +; AGGRESSIVE-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <4 x i8> [[FROM_CAST]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <8 x i8> [[FROM_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[FROM_SHUFFLE]] +; + %from.0 = extractelement <2 x i16> %from, i64 0 + %mask.0 = trunc i16 %from.0 to i8 + %shr.1 = lshr i16 %from.0, 8 + %mask.1 = trunc i16 %shr.1 to i8 + + %from.1 = extractelement <2 x i16> %from, i64 1 + %mask.2 = trunc i16 %from.1 to i8 + %shr.3 = lshr i16 %from.1, 8 + %mask.3 = trunc i16 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> , i8 %mask.0, i64 4 + %build.1 = insertelement <8 x i8> %build.0, i8 %mask.1, i64 5 + %build.2 = insertelement <8 x i8> %build.1, i8 %mask.2, i64 6 + %build.3 = insertelement <8 x i8> %build.2, i8 %mask.3, i64 7 + + ret <8 x i8> %build.3 +} + +define <4 x i16> @obtain_v2i16_shifted(<4 x i8> %from) { +; LAZY-LABEL: define <4 x i16> @obtain_v2i16_shifted( +; LAZY-SAME: <4 x i8> [[FROM:%.*]]) { +; LAZY-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; LAZY-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <2 x i16> [[FROM_CAST]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> [[FROM_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> @obtain_v2i16_shifted( +; AGGRESSIVE-SAME: <4 x i8> [[FROM:%.*]]) { +; AGGRESSIVE-NEXT: [[FROM_CAST:%.*]] = bitcast <4 x i8> [[FROM]] to <2 x i16> +; AGGRESSIVE-NEXT: [[FROM_EXTRACT:%.*]] = shufflevector <2 x i16> [[FROM_CAST]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[FROM_SHUFFLE:%.*]] = shufflevector <4 x i16> 
[[FROM_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[FROM_SHUFFLE]] +; + %from.0 = extractelement <4 x i8> %from, i64 0 + %zext.0 = zext i8 %from.0 to i16 + + %from.1 = extractelement <4 x i8> %from, i64 1 + %zext.1 = zext i8 %from.1 to i16 + %shl.1 = shl i16 %zext.1, 8 + %out.1 = or i16 %zext.0, %shl.1 + + %from.2 = extractelement <4 x i8> %from, i64 2 + %zext.2 = zext i8 %from.2 to i16 + + %from.3 = extractelement <4 x i8> %from, i64 3 + %zext.3 = zext i8 %from.3 to i16 + %shl.3 = shl i16 %zext.3, 8 + %out.2 = or i16 %zext.2, %shl.3 + + %build.0 = insertelement <4 x i16> , i16 %out.1, i64 2 + %build.1 = insertelement <4 x i16> %build.0, i16 %out.2, i64 3 + + ret <4 x i16> %build.1 +} + +define <8 x i8> @combine_v4i8(<4 x i8> %bot, <2 x i16> %top) { +; LAZY-LABEL: define <8 x i8> @combine_v4i8( +; LAZY-SAME: <4 x i8> [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast <2 x i16> [[TOP]] to <4 x i8> +; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> +; LAZY-NEXT: ret <8 x i8> [[TOP_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <8 x i8> @combine_v4i8( +; AGGRESSIVE-SAME: <4 x i8> [[BOT:%.*]], <2 x i16> [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast <2 x i16> [[TOP]] to <4 x i8> +; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <4 x i8> [[BOT]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <8 x i8> [[BOT_EXTRACT]], <8 x i8> zeroinitializer, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <4 x i8> [[TOP_CAST]], <4 x i8> poison, <8 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <8 x i8> [[TOP_EXTRACT]], <8 x i8> [[BOT_SHUFFLE]], <8 x i32> +; AGGRESSIVE-NEXT: ret <8 x i8> [[TOP_SHUFFLE]] +; + %base = shufflevector <4 x i8> %bot, <4 x i8> poison, <8 x i32> + + %top.0 = extractelement <2 x i16> %top, i64 0 + %mask.0 = trunc i16 %top.0 to i8 + %shr.1 = lshr i16 %top.0, 8 + %mask.1 = trunc i16 %shr.1 to i8 + + %top.1 = extractelement <2 x i16> %top, i64 1 + %mask.2 = trunc i16 %top.1 to i8 + %shr.3 = lshr i16 %top.1, 8 + %mask.3 = trunc i16 %shr.3 to i8 + + %build.0 = insertelement <8 x i8> %base, i8 %mask.0, i64 4 + %build.1 = insertelement <8 x i8> %build.0, i8 %mask.1, i64 5 + %build.2 = insertelement <8 x i8> %build.1, i8 %mask.2, i64 6 + %build.3 = insertelement <8 x i8> %build.2, i8 %mask.3, i64 7 + + ret <8 x i8> %build.3 +} + +define <4 x i16> @combine_v2i16(<2 x i16> %bot, <4 x i8> %top) { +; LAZY-LABEL: define <4 x i16> @combine_v2i16( +; LAZY-SAME: <2 x i16> [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) { +; LAZY-NEXT: [[TOP_CAST:%.*]] = bitcast <4 x i8> [[TOP]] to <2 x i16> +; LAZY-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; LAZY-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> +; LAZY-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> +; LAZY-NEXT: ret <4 x i16> [[TOP_SHUFFLE]] +; +; AGGRESSIVE-LABEL: define <4 x i16> 
@combine_v2i16( +; AGGRESSIVE-SAME: <2 x i16> [[BOT:%.*]], <4 x i8> [[TOP:%.*]]) { +; AGGRESSIVE-NEXT: [[TOP_CAST:%.*]] = bitcast <4 x i8> [[TOP]] to <2 x i16> +; AGGRESSIVE-NEXT: [[BOT_EXTRACT:%.*]] = shufflevector <2 x i16> [[BOT]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[BOT_SHUFFLE:%.*]] = shufflevector <4 x i16> [[BOT_EXTRACT]], <4 x i16> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_EXTRACT:%.*]] = shufflevector <2 x i16> [[TOP_CAST]], <2 x i16> poison, <4 x i32> +; AGGRESSIVE-NEXT: [[TOP_SHUFFLE:%.*]] = shufflevector <4 x i16> [[TOP_EXTRACT]], <4 x i16> [[BOT_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i16> [[TOP_SHUFFLE]] +; + %base = shufflevector <2 x i16> %bot, <2 x i16> poison, <4 x i32> + + %top.0 = extractelement <4 x i8> %top, i64 0 + %zext.0 = zext i8 %top.0 to i16 + + %top.1 = extractelement <4 x i8> %top, i64 1 + %zext.1 = zext i8 %top.1 to i16 + %shl.1 = shl i16 %zext.1, 8 + %out.1 = or i16 %zext.0, %shl.1 + + %top.2 = extractelement <4 x i8> %top, i64 2 + %zext.2 = zext i8 %top.2 to i16 + + %top.3 = extractelement <4 x i8> %top, i64 3 + %zext.3 = zext i8 %top.3 to i16 + %shl.3 = shl i16 %zext.3, 8 + %out.2 = or i16 %zext.2, %shl.3 + + %build.0 = insertelement <4 x i16> %base, i16 %out.1, i64 2 + %build.1 = insertelement <4 x i16> %build.0, i16 %out.2, i64 3 + + ret <4 x i16> %build.1 +} + +define <4 x i8> @shuffle_elts(<4 x i8> %vec) { +; LAZY-LABEL: define <4 x i8> @shuffle_elts( +; LAZY-SAME: <4 x i8> [[VEC:%.*]]) { +; LAZY-NEXT: [[SHUFFLED:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> poison, <4 x i32> +; LAZY-NEXT: ret <4 x i8> [[SHUFFLED]] +; +; AGGRESSIVE-LABEL: define <4 x i8> @shuffle_elts( +; AGGRESSIVE-SAME: <4 x i8> [[VEC:%.*]]) { +; AGGRESSIVE-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> zeroinitializer, <4 x i32> +; AGGRESSIVE-NEXT: [[VEC_SHUFFLE2:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> [[VEC_SHUFFLE]], <4 x i32> +; AGGRESSIVE-NEXT: [[VEC_SHUFFLE4:%.*]] = shufflevector <4 x i8> [[VEC]], <4 x i8> [[VEC_SHUFFLE2]], <4 x i32> +; AGGRESSIVE-NEXT: ret <4 x i8> [[VEC_SHUFFLE4]] +; + %shuffled = shufflevector <4 x i8> %vec, <4 x i8> poison, <4 x i32> + ret <4 x i8> %shuffled +} From 932bc6bf48264f927a3e051d5273808ee891ba8d Mon Sep 17 00:00:00 2001 From: Zach Goldthorpe Date: Thu, 3 Jul 2025 18:42:57 -0500 Subject: [PATCH 2/2] Address a subset of the reviewer feedback. --- llvm/include/llvm/Transforms/Scalar.h | 4 +- .../Scalar/PackedIntegerCombinePass.h | 18 +- llvm/lib/Passes/PassBuilder.cpp | 29 ++ llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/lib/Passes/PassRegistry.def | 7 +- .../Scalar/PackedIntegerCombinePass.cpp | 392 ++++++++---------- .../PackedIntegerCombine/instructions.ll | 51 +-- .../PackedIntegerCombine/int2int.ll | 4 +- .../PackedIntegerCombine/int2vec.ll | 4 +- .../PackedIntegerCombine/vec2int.ll | 4 +- .../PackedIntegerCombine/vec2vec.ll | 4 +- 11 files changed, 228 insertions(+), 291 deletions(-) diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index ec9d89507c375..bd5b112ed6105 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -159,7 +159,9 @@ LLVM_ABI extern char &InferAddressSpacesID; // PackedIntegerCombinePass - Tracks individual bytes through instructions to // systematically identify redundant byte packing or unpacking operations. 
 //
-LLVM_ABI FunctionPass *createPackedIntegerCombinePass();
+LLVM_ABI FunctionPass *
+createPackedIntegerCombinePass(unsigned MaxCollectionIterations = 2,
+                               bool AggressiveRewriting = false);
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
index a5916e2e611cf..2ca56c06784dc 100644
--- a/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
+++ b/llvm/include/llvm/Transforms/Scalar/PackedIntegerCombinePass.h
@@ -21,10 +21,26 @@
 
 namespace llvm {
 
+struct PackedIntegerCombineOptions {
+  /// Maximum number of iterations to isolate final packed instructions.
+  unsigned MaxCollectionIterations = 2;
+  /// Aggressively rewrite packed instructions.
+  bool AggressiveRewriting = false;
+};
+
 class PackedIntegerCombinePass
     : public PassInfoMixin<PackedIntegerCombinePass> {
+
+  PackedIntegerCombineOptions Options;
+
 public:
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  PackedIntegerCombinePass(PackedIntegerCombineOptions Options = {})
+      : Options(Options) {}
+
+  LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  LLVM_ABI void
+  printPipeline(raw_ostream &OS,
+                function_ref<StringRef(StringRef)> MapClassName2PassName);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 7a382ace34dbc..911a043fbae53 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1170,6 +1170,35 @@ Expected<MergedLoadStoreMotionOptions> parseMergedLoadStoreMotionOptions(StringRef Params) {
   return Result;
 }
 
+Expected<PackedIntegerCombineOptions>
+parsePackedIntegerCombineOptions(StringRef Params) {
+  PackedIntegerCombineOptions Options;
+  while (!Params.empty()) {
+    StringRef ParamName;
+    std::tie(ParamName, Params) = Params.split(';');
+
+    if (ParamName.consume_front("max-iterations=")) {
+      if (ParamName.getAsInteger(0, Options.MaxCollectionIterations))
+        return make_error<StringError>(
+            formatv("invalid max iteration count for PackedIntegerCombine "
+                    "pass: '{}'",
+                    ParamName)
+                .str(),
+            inconvertibleErrorCode());
+    } else if (ParamName == "aggressive") {
+      Options.AggressiveRewriting = true;
+    } else {
+      return make_error<StringError>(
+          formatv("invalid argument for PackedIntegerCombinePass: '{}'",
+                  ParamName)
+              .str(),
+          inconvertibleErrorCode());
+    }
+  }
+
+  return Options;
+}
+
 Expected<GVNOptions> parseGVNOptions(StringRef Params) {
   GVNOptions Result;
   while (!Params.empty()) {
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 2da72606bc47a..9b2254607d954 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -2361,4 +2361,4 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
 bool PassBuilder::isInstrumentedPGOUse() const {
   return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) ||
          !UseCtxProfile.empty();
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 6f1c405a5efa7..812eb5d3c15c4 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -476,7 +476,6 @@ FUNCTION_PASS("objc-arc", ObjCARCOptPass())
 FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass())
 FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass())
 FUNCTION_PASS("pa-eval", PAEvalPass())
-FUNCTION_PASS("packedintcombine", PackedIntegerCombinePass())
 FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
 FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
FUNCTION_PASS("place-safepoints", PlaceSafepointsPass()) @@ -628,6 +627,12 @@ FUNCTION_PASS_WITH_PARAMS( return MergedLoadStoreMotionPass(Opts); }, parseMergedLoadStoreMotionOptions, "no-split-footer-bb;split-footer-bb") +FUNCTION_PASS_WITH_PARAMS( + "packed-integer-combine", "PackedIntegerCombinePass", + [](PackedIntegerCombineOptions Options) { + return PackedIntegerCombinePass(Options); + }, + parsePackedIntegerCombineOptions, "max-iterations=N;aggressive") FUNCTION_PASS_WITH_PARAMS( "print", "LoopAccessInfoPrinterPass", [](bool AllowPartial) { diff --git a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp index 31edd28069a2b..fc3b57b71bf45 100644 --- a/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp +++ b/llvm/lib/Transforms/Scalar/PackedIntegerCombinePass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// /// -/// This file provides the interface for LLVM's Packed Integer Combine pass. +/// This file implements the interface for LLVM's Packed Integer Combine pass. /// This pass tries to treat integers as packed chunks of individual bytes, /// and leverage this to coalesce needlessly fragmented /// computations. @@ -24,21 +24,11 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" +#include using namespace llvm; -#define DEBUG_TYPE "packedintcombine" - -static cl::opt MaxCollectionIterations( - "packedint-max-iterations", - cl::desc("Maximum number of iterations to isolate final packed " - "instructions. Set to 0 to iterate until convergence."), - cl::init(2), cl::Hidden); - -static cl::opt - AggressiveRewriting("packedint-aggressive-rewriter", - cl::desc("Aggressively rewrite packed instructions."), - cl::init(false), cl::Hidden); +#define DEBUG_TYPE "packed-integer-combine" namespace { @@ -101,10 +91,10 @@ class Byte { else Base->printAsOperand(ROS, false); - ROS << "[" << Integer << "]"; + ROS << '[' << Integer << ']'; if (NewLine) - ROS << "\n"; + ROS << '\n'; } LLVM_DUMP_METHOD void dump() const { print(errs(), true); } @@ -128,7 +118,7 @@ struct ByteLayout { }; /// Interpret the given type as a number of packed bytes, if possible. -static std::optional tryGetByteLayout(const Type *Ty) { +static std::optional getByteLayout(const Type *Ty) { unsigned IntBitWidth, NumElts; if (const auto *IntTy = dyn_cast(Ty)) { IntBitWidth = IntTy->getBitWidth(); @@ -148,13 +138,6 @@ static std::optional tryGetByteLayout(const Type *Ty) { return ByteLayout{IntBitWidth / Byte::BitWidth, NumElts}; } -/// Interpret the given type as a number of backed bytes (aborts if impossible). -static ByteLayout getByteLayout(const Type *Ty) { - const std::optional Layout = tryGetByteLayout(Ty); - assert(Layout); - return *Layout; -} - /// A convenience class for combining Byte instances obtained from the same base /// value, and with a common relative offset, which can hence be obtained /// simultaneously. @@ -167,7 +150,7 @@ struct CoalescedBytes { /// /// For instance, if bytes 3, 4, 5 of some value %val are coalesced to provide /// bytes 0, 1, 2 of the target %tgt, then ShrByteOffset = 3. - signed SignedShrByteOffset; + int SignedShrByteOffset; /// The bitmask identifying which bytes of the target value are covered by /// these coalesced bytes. /// @@ -176,12 +159,12 @@ struct CoalescedBytes { /// be set, corresponding to the first three bits of %tgt. 
SmallBitVector Mask; - explicit CoalescedBytes(Value &Base, signed Offset, SmallBitVector Mask) + explicit CoalescedBytes(Value &Base, int Offset, SmallBitVector Mask) : Base(&Base), SignedShrByteOffset(Offset), Mask(Mask) {} - explicit CoalescedBytes(Value &Base, signed Offset, unsigned NumBytes) + explicit CoalescedBytes(Value &Base, int Offset, unsigned NumBytes) : Base(&Base), SignedShrByteOffset(Offset), Mask(NumBytes) {} - bool alignsWith(Value *V, signed VOffset) const { + bool alignsWith(Value *V, int VOffset) const { return Base == V && SignedShrByteOffset == VOffset; } @@ -206,16 +189,16 @@ struct CoalescedBytes { for (unsigned Idx = 0; Idx < Mask.size(); ++Idx) { if (Mask.test(Idx)) { Base->printAsOperand(ROS, false); - ROS << "[" << (static_cast(Idx) + SignedShrByteOffset) << "]"; + ROS << '[' << (static_cast(Idx) + SignedShrByteOffset) << ']'; } else ROS << 0; ROS << "; "; } - ROS << "}"; + ROS << '}'; if (NewLine) - ROS << "\n"; + ROS << '\n'; } LLVM_DUMP_METHOD void dump() const { print(errs(), true); } @@ -254,67 +237,23 @@ using ByteVector = SmallVector; /// The decomposition of an IR value into its individual bytes, tracking where /// each byte is obtained. -class ByteDefinition { - /// Enum classifying what Ptr points to. - enum ByteType : uint8_t { - /// Ptr's value is undefined. - INVALID, - /// The byte definition is given by a ByteVector, which is referenced (but - /// not captured) by Ptr. - VECTOR, - /// The bytes are obtained from a (currently opaque) IR value, held by Ptr. - VALUE, - /// The bytes are obtained from a constant integer, held by Ptr. - CONST_INT, - /// The bytes are obtained from a constant vector of integers, held by Ptr. - CONST_VEC, - }; - - ByteType DefType; - void *Ptr; +struct ByteDefinition { + std::variant Ptr; ByteLayout Layout; - ByteDefinition(ByteType DefType, void *Ptr, ByteLayout Layout) - : DefType(DefType), Ptr(Ptr), Layout(Layout) {} public: /// Indicate that a value cannot be decomposed into bytes in a known way. - static ByteDefinition invalid() { return {INVALID, nullptr, {0, 0}}; } + static ByteDefinition invalid() { return {std::nullopt, {0, 0}}; } /// Indicate that a value's bytes are known, and track their producers. static ByteDefinition vector(ByteVector &Ref, ByteLayout Layout) { - return {VECTOR, &Ref, Layout}; + return {&Ref, Layout}; } /// Indicate that a value's bytes are opaque. static ByteDefinition value(Value &V) { - return {VALUE, &V, getByteLayout(V.getType())}; - } - /// Indicate that the bytes come from a constant integer. - static ByteDefinition constInt(ConstantInt &Int) { - return {CONST_INT, &Int, getByteLayout(Int.getType())}; - } - /// Indicate that the bytes come from a constant vector of integers. - static ByteDefinition constVec(Constant &Vec) { - assert(Vec.getType()->isVectorTy()); - return {CONST_VEC, &Vec, getByteLayout(Vec.getType())}; - } - - ByteVector &getVector() const { - assert(DefType == VECTOR); - return *static_cast(Ptr); - } - Value &getValue() const { - assert(DefType == VALUE); - return *static_cast(Ptr); - } - ConstantInt &getConstInt() const { - assert(DefType == CONST_INT); - return *static_cast(Ptr); - } - Constant &getConstVec() const { - assert(DefType == CONST_VEC); - return *static_cast(Ptr); + return {&V, *getByteLayout(V.getType())}; } - bool isValid() const { return DefType != INVALID; } + bool isValid() const { return !std::holds_alternative(Ptr); } /// Return true iff the byte definition is valid. 
operator bool() const { return isValid(); } @@ -322,62 +261,65 @@ class ByteDefinition { /// Get the definition of the byte at the specified byte offset, where 0 is /// the least significant byte. Byte getByte(unsigned Idx) const { - switch (DefType) { - default: - llvm_unreachable("Invalid byte definition"); - case VECTOR: - return getVector()[Idx].getByte(); - case VALUE: - return Byte(getValue(), Idx); - case CONST_INT: - return Byte(getConstInt().getValue().extractBitsAsZExtValue( - Byte::BitWidth, Idx * Byte::BitWidth)); - case CONST_VEC: { - const auto &Vec = getConstVec(); - const ByteLayout Layout = getByteLayout(Vec.getType()); - const unsigned VecIdx = Idx / Layout.NumBytesPerElement; - const unsigned EltIdx = Idx % Layout.NumBytesPerElement; - - Constant *Elt = Vec.getAggregateElement(VecIdx); - if (const auto *Int = dyn_cast(Elt)) - return Byte(Int->getValue().extractBitsAsZExtValue( - Byte::BitWidth, EltIdx * Byte::BitWidth)); - - return Byte(*Elt, EltIdx); - } - } + struct Visitor { + unsigned Idx; + + Byte operator()(std::nullopt_t) { + llvm_unreachable("Invalid byte definition"); + } + Byte operator()(ByteVector *BV) { return (*BV)[Idx].getByte(); } + Byte operator()(Value *V) { + if (auto *Int = dyn_cast(V)) + return Byte(Int->getValue().extractBitsAsZExtValue( + Byte::BitWidth, Idx * Byte::BitWidth)); + + if (V->getType()->isVectorTy()) { + if (auto *Vec = dyn_cast(V)) { + const ByteLayout Layout = *getByteLayout(Vec->getType()); + const unsigned VecIdx = Idx / Layout.NumBytesPerElement; + const unsigned EltIdx = Idx % Layout.NumBytesPerElement; + + if (Constant *Elt = Vec->getAggregateElement(VecIdx)) { + if (const auto *Int = dyn_cast(Elt)) + return Byte(Int->getValue().extractBitsAsZExtValue( + Byte::BitWidth, EltIdx * Byte::BitWidth)); + + return Byte(*Elt, EltIdx); + } + } + } + + return Byte(*V, Idx); + } + }; + + return std::visit(Visitor{Idx}, Ptr); } const ByteLayout &getLayout() const { return Layout; } void print(raw_ostream &ROS, bool NewLine = true) const { - switch (DefType) { - default: - ROS << "[INVALID]"; - break; - case VECTOR: { - ByteVector &BV = getVector(); - ROS << "{ "; - for (unsigned ByteIdx = 0; ByteIdx < BV.size(); ++ByteIdx) - ROS << ByteIdx << ": " << BV[ByteIdx].getByte() << "; "; - ROS << "}"; - break; - } - case VALUE: - ROS << "("; - getValue().printAsOperand(ROS); - ROS << ")[0:" << Layout.getNumBytes() << "]"; - break; - case CONST_INT: - ROS << getConstInt(); - break; - case CONST_VEC: - ROS << getConstVec(); - break; - } + struct Visitor { + raw_ostream &ROS; + const ByteLayout &Layout; + + void operator()(std::nullopt_t) { ROS << "[INVALID]"; } + void operator()(ByteVector *BV) { + ROS << "{ "; + for (unsigned ByteIdx = 0; ByteIdx < BV->size(); ++ByteIdx) + ROS << ByteIdx << ": " << (*BV)[ByteIdx].getByte() << "; "; + ROS << '}'; + } + void operator()(Value *V) { + ROS << '('; + V->printAsOperand(ROS); + ROS << ")[0:" << Layout.getNumBytes() << ']'; + } + }; + std::visit(Visitor{ROS, Layout}, Ptr); if (NewLine) - ROS << "\n"; + ROS << '\n'; } LLVM_DUMP_METHOD void dump() const { print(errs(), true); } @@ -415,7 +357,6 @@ class ByteExpander final : public InstVisitor { // Visitation implementations return `true` iff a new byte definition was // successfully constructed. 
- ByteVector visitAdd(BinaryOperator &I); ByteVector visitAnd(BinaryOperator &I); ByteVector visitOr(BinaryOperator &I); ByteVector visitXor(BinaryOperator &I); @@ -455,47 +396,18 @@ class ByteExpander final : public InstVisitor { /// Iterate over all instructions in a function over several passes to /// identify all final values and their byte definitions. - std::vector collectPIICandidates(Function &F); + std::vector + collectPIICandidates(Function &F, unsigned MaxCollectionIterations); }; -ByteVector ByteExpander::visitAdd(BinaryOperator &I) { - const ByteDefinition LhsDef = - getByteDefinitionIfIntermediateOperand(I.getOperand(0)); +ByteVector ByteExpander::visitAnd(BinaryOperator &I) { const ByteDefinition RhsDef = getByteDefinitionIfIntermediateOperand(I.getOperand(1)); - if (!LhsDef || !RhsDef) + if (!RhsDef) return {}; - - const ByteLayout &Layout = LhsDef.getLayout(); - const unsigned NumBytes = Layout.getNumBytes(); - - ByteVector BV; - BV.reserve(NumBytes); - - for (unsigned ByteIdx = 0; ByteIdx < NumBytes; ++ByteIdx) { - const Byte Lhs = LhsDef.getByte(ByteIdx); - const Byte Rhs = RhsDef.getByte(ByteIdx); - - const bool LhsIsZero = Lhs.isConstant() && Lhs.getConstant() == 0; - const bool RhsIsZero = Rhs.isConstant() && Rhs.getConstant() == 0; - if (LhsIsZero) - BV.emplace_back(Rhs, RhsIsZero ? ByteUse::AllOperands : 1); - else if (RhsIsZero) - BV.emplace_back(Lhs, 0); - else - return {}; - } - - assert(BV.size() == NumBytes); - return BV; -} - -ByteVector ByteExpander::visitAnd(BinaryOperator &I) { const ByteDefinition LhsDef = getByteDefinitionIfIntermediateOperand(I.getOperand(0)); - const ByteDefinition RhsDef = - getByteDefinitionIfIntermediateOperand(I.getOperand(1)); - if (!LhsDef || !RhsDef) + if (!LhsDef) return {}; const ByteLayout &Layout = LhsDef.getLayout(); @@ -546,11 +458,13 @@ ByteVector ByteExpander::visitAnd(BinaryOperator &I) { } ByteVector ByteExpander::visitOr(BinaryOperator &I) { - const ByteDefinition LhsDef = - getByteDefinitionIfIntermediateOperand(I.getOperand(0)); const ByteDefinition RhsDef = getByteDefinitionIfIntermediateOperand(I.getOperand(1)); - if (!LhsDef || !RhsDef) + if (!RhsDef) + return {}; + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!LhsDef) return {}; const ByteLayout &Layout = LhsDef.getLayout(); @@ -598,11 +512,13 @@ ByteVector ByteExpander::visitOr(BinaryOperator &I) { } ByteVector ByteExpander::visitXor(BinaryOperator &I) { - const ByteDefinition LhsDef = - getByteDefinitionIfIntermediateOperand(I.getOperand(0)); const ByteDefinition RhsDef = getByteDefinitionIfIntermediateOperand(I.getOperand(1)); - if (!LhsDef || !RhsDef) + if (!RhsDef) + return {}; + const ByteDefinition LhsDef = + getByteDefinitionIfIntermediateOperand(I.getOperand(0)); + if (!LhsDef) return {}; const ByteLayout &Layout = LhsDef.getLayout(); @@ -629,6 +545,10 @@ ByteVector ByteExpander::visitXor(BinaryOperator &I) { } ByteVector ByteExpander::visitShl(BinaryOperator &I) { + const auto *Const = dyn_cast(I.getOperand(1)); + if (!Const) + return {}; + const ByteDefinition BaseDef = getByteDefinitionIfIntermediateOperand(I.getOperand(0)); if (!BaseDef) @@ -636,10 +556,6 @@ ByteVector ByteExpander::visitShl(BinaryOperator &I) { const unsigned NumBytes = BaseDef.getLayout().getNumBytes(); - const auto *Const = dyn_cast(I.getOperand(1)); - if (!Const) - return {}; - if (isa(Const)) { const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue(); if (ShAmt % Byte::BitWidth != 0) @@ -683,6 +599,10 @@ ByteVector 
ByteExpander::visitShl(BinaryOperator &I) { } ByteVector ByteExpander::visitLShr(BinaryOperator &I) { + const auto *Const = dyn_cast(I.getOperand(1)); + if (!Const) + return {}; + const ByteDefinition BaseDef = getByteDefinitionIfIntermediateOperand(I.getOperand(0)); if (!BaseDef) @@ -690,10 +610,6 @@ ByteVector ByteExpander::visitLShr(BinaryOperator &I) { const unsigned NumBytes = BaseDef.getLayout().getNumBytes(); - const auto *Const = dyn_cast(I.getOperand(1)); - if (!Const) - return {}; - if (isa(Const)) { const unsigned ShAmt = Const->getUniqueInteger().getLimitedValue(); if (ShAmt % Byte::BitWidth != 0) @@ -739,12 +655,12 @@ ByteVector ByteExpander::visitLShr(BinaryOperator &I) { } ByteVector ByteExpander::visitTruncInst(TruncInst &I) { - const std::optional Layout = tryGetByteLayout(I.getType()); + const std::optional Layout = getByteLayout(I.getType()); if (!Layout) return {}; const std::optional SrcLayout = - tryGetByteLayout(I.getOperand(0)->getType()); + getByteLayout(I.getOperand(0)->getType()); if (!SrcLayout) return {}; @@ -768,7 +684,7 @@ ByteVector ByteExpander::visitTruncInst(TruncInst &I) { } ByteVector ByteExpander::visitZExtInst(ZExtInst &I) { - const std::optional Layout = tryGetByteLayout(I.getType()); + const std::optional Layout = getByteLayout(I.getType()); if (!Layout) return {}; @@ -798,7 +714,7 @@ ByteVector ByteExpander::visitZExtInst(ZExtInst &I) { } ByteVector ByteExpander::visitBitCastInst(BitCastInst &I) { - const std::optional Layout = tryGetByteLayout(I.getType()); + const std::optional Layout = getByteLayout(I.getType()); if (!Layout) return {}; @@ -875,7 +791,7 @@ ByteVector ByteExpander::visitInsertElementInst(InsertElementInst &I) { } ByteVector ByteExpander::visitShuffleVectorInst(ShuffleVectorInst &I) { - const std::optional Layout = tryGetByteLayout(I.getType()); + const std::optional Layout = getByteLayout(I.getType()); if (!Layout) return {}; @@ -961,16 +877,10 @@ ByteVector *ByteExpander::expandByteDefinition(Value *V) { } ByteDefinition ByteExpander::getByteDefinition(Value *V, bool ExpandDef) { - const std::optional Layout = tryGetByteLayout(V->getType()); + const std::optional Layout = getByteLayout(V->getType()); if (!Layout) return ByteDefinition::invalid(); - if (auto *ConstInt = dyn_cast(V)) - return ByteDefinition::constInt(*ConstInt); - if (auto *Const = dyn_cast(V)) - if (Const->getType()->isVectorTy()) - return ByteDefinition::constVec(*Const); - if (ExpandDef) if (ByteVector *BV = expandByteDefinition(V)) return ByteDefinition::vector(*BV, *Layout); @@ -986,7 +896,7 @@ bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) { if (isa(V)) return true; - /// Short-circuit check. + // Short-circuit check. 
if (IsOperand && V->hasOneUse()) return true; @@ -997,13 +907,14 @@ bool ByteExpander::checkIfIntermediate(Value *V, bool IsOperand) { return Definitions.contains(*FU.begin()); } -std::vector ByteExpander::collectPIICandidates(Function &F) { +std::vector +ByteExpander::collectPIICandidates(Function &F, + unsigned MaxCollectionIterations) { std::vector PackedIntInsts; - LLVM_DEBUG(dbgs() << "PICP: Entering function " << F.getName() << "\n"); unsigned NumIterations = 1; for (;;) { - LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << "\n"); + LLVM_DEBUG(dbgs() << "PICP: Iteration " << NumIterations << '\n'); bool Converged = true; std::vector CollectedInsts; @@ -1034,7 +945,7 @@ std::vector ByteExpander::collectPIICandidates(Function &F) { LLVM_DEBUG({ dbgs() << "PICP: Updating definition: "; I.printAsOperand(dbgs()); - dbgs() << " = " << getByteDefinition(&I) << "\n"; + dbgs() << " = " << getByteDefinition(&I) << '\n'; }); } } @@ -1056,7 +967,7 @@ std::vector ByteExpander::collectPIICandidates(Function &F) { ++NumIterations; } - LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << "\n"); + LLVM_DEBUG(dbgs() << "PICP: Total iterations: " << NumIterations << '\n'); return PackedIntInsts; } @@ -1203,7 +1114,7 @@ class BytePackFolder { LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Queuing cast " << *CI << "\n"; + dbgs() << "]: Queuing cast " << *CI << '\n'; }); return CI; } @@ -1216,7 +1127,7 @@ class BytePackFolder { LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Queuing inst " << *I << "\n"; + dbgs() << "]: Queuing inst " << *I << '\n'; }); return I; } @@ -1346,7 +1257,7 @@ class BytePackFolder { const unsigned NumTargetBytes = Layout.getNumBytes(); Value *V = CB.Base; - const unsigned NumSrcBytes = getByteLayout(V->getType()).getNumBytes(); + const unsigned NumSrcBytes = getByteLayout(V->getType())->getNumBytes(); const StringRef &Name = V->getName(); // Transformation: shr -> trunc -> mask -> zext -> shl @@ -1418,7 +1329,7 @@ class BytePackFolder { const unsigned NumTargetBytes = Layout.getNumBytes(); Value *V = CB.Base; const StringRef &Name = V->getName(); - ByteLayout VecLayout = getByteLayout(V->getType()); + ByteLayout VecLayout = *getByteLayout(V->getType()); // For sub-element accesses, try to subdivide the vector into smaller // elements. @@ -1431,7 +1342,7 @@ class BytePackFolder { auto *NewTy = FixedVectorType::get(TargetIntTy, VecLayout.NumVecElements * SplitFactor); V = pushCast(Instruction::BitCast, V, NewTy); - VecLayout = getByteLayout(V->getType()); + VecLayout = *getByteLayout(V->getType()); } // Give up if bytes are obtained from a strange offset. 
@@ -1526,7 +1437,7 @@ class BytePackFolder { auto *TargetVecTy = cast(TargetInst->getType()); Type *TargetEltTy = TargetVecTy->getElementType(); - const ByteLayout SrcLayout = getByteLayout(CB.Base->getType()); + const ByteLayout SrcLayout = *getByteLayout(CB.Base->getType()); Value *V = CB.Base; const StringRef &Name = V->getName(); @@ -1602,7 +1513,7 @@ class BytePackFolder { public: BytePackFolder(Instruction *TargetV) - : TargetInst(TargetV), Layout(getByteLayout(TargetV->getType())), + : TargetInst(TargetV), Layout(*getByteLayout(TargetV->getType())), VectorAlignedPack(PartialBytePack::invalid()) {} ~BytePackFolder() { @@ -1612,7 +1523,7 @@ class BytePackFolder { LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Dequeuing cast " << *I << "\n"; + dbgs() << "]: Dequeuing cast " << *I << '\n'; }); I->replaceAllUsesWith(PoisonValue::get(I->getType())); I->deleteValue(); @@ -1622,7 +1533,7 @@ class BytePackFolder { LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Dequeuing inst " << *Insts.back() << "\n"; + dbgs() << "]: Dequeuing inst " << *Insts.back() << '\n'; }); Insts.back()->deleteValue(); Insts.pop_back(); @@ -1632,15 +1543,17 @@ class BytePackFolder { /// Try to generate instructions for coalescing the given bytes and aligning /// them to the target value. Returns true iff this is successful. bool pushCoalescedBytes(CoalescedBytes CB) { - if (isa(CB.Base) && CB.SignedShrByteOffset == 0) { - WorkList.emplace_back(CB.Base, CB.Mask); - return true; - } + if (CB.SignedShrByteOffset == 0) + if (auto *Const = dyn_cast(CB.Base)) { + WorkList.emplace_back( + ConstantExpr::getBitCast(Const, TargetInst->getType()), CB.Mask); + return true; + } LLVM_DEBUG({ dbgs() << "PICP ["; TargetInst->printAsOperand(dbgs()); - dbgs() << "]: Preparing bytes " << CB << "\n"; + dbgs() << "]: Preparing bytes " << CB << '\n'; }); if (isa(TargetInst->getType())) { if (isa(CB.Base->getType())) @@ -1735,8 +1648,9 @@ struct PackedIntInstruction { /// If the rewriter is non-aggressive, return nullopt if the rewriting is /// determined to be unnecessary. static std::optional> -getCoalescingOpportunity(Type *Ty, const ByteVector &BV) { - const ByteLayout Layout = getByteLayout(Ty); +getCoalescingOpportunity(Type *Ty, const ByteVector &BV, + bool AggressiveRewriting) { + const ByteLayout Layout = *getByteLayout(Ty); assert(Layout.getNumBytes() == BV.size() && "Byte definition has unexpected width."); @@ -1761,8 +1675,8 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) { } else { CoalescedBytes *CB = nullptr; Value *Base = B.getBase(); - const signed Offset = - static_cast(B.getIndex()) - static_cast(ByteIdx); + const int Offset = + static_cast(B.getIndex()) - static_cast(ByteIdx); for (unsigned CBIdx = 0; CBIdx < CBV.size(); ++CBIdx) { if (CBV[CBIdx].alignsWith(Base, Offset)) { CB = &CBV[CBIdx]; @@ -1773,7 +1687,7 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) { LLVM_DEBUG(dbgs() << "PICP: Bytes " << *CB << " from operand " << OpIdx << " can be coalesced with byte " << B - << " from operand " << BU.getOperandIndex() << "\n"); + << " from operand " << BU.getOperandIndex() << '\n'); OperandsAlreadyCoalesced = false; } } @@ -1831,7 +1745,8 @@ getCoalescingOpportunity(Type *Ty, const ByteVector &BV) { /// Queue into \p PIIV the set of final values (or operands thereof, if the /// rewriter is non-aggressive) which are deemed beneficial to rewrite. 
 static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
-                           Instruction &FinalInst, ByteExpander &BE) {
+                           Instruction &FinalInst, ByteExpander &BE,
+                           bool AggressiveRewriting) {
   SmallVector<Instruction *> WorkList{&FinalInst};
   SmallPtrSet Seen{&FinalInst};
 
@@ -1844,15 +1759,15 @@ static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
       // This instruction is beyond the analysis scope of PICP.
       continue;
 
-    LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << "\n"
+    LLVM_DEBUG(dbgs() << "PICP rewrite candidate: " << *I << '\n'
                       << " byte pack: " << BE.getByteDefinition(I)
-                      << "\n");
+                      << '\n');
 
     auto CBV = [&]() -> std::optional<SmallVector<CoalescedBytes>> {
      // Short-circuit check for casts.
      if (!AggressiveRewriting && I->getNumOperands() == 1)
        return std::nullopt;
-      return getCoalescingOpportunity(I->getType(), *BV);
+      return getCoalescingOpportunity(I->getType(), *BV, AggressiveRewriting);
     }();
 
     if (!CBV) {
@@ -1868,19 +1783,20 @@ static void queueRewriting(std::vector<PackedIntInstruction> &PIIV,
   } while (!WorkList.empty());
 }
 
-static bool runImpl(Function &F) {
+static bool runImpl(Function &F, PackedIntegerCombineOptions Options) {
   ByteExpander BE;
 
-  std::vector<Instruction *> PIICandidates = BE.collectPIICandidates(F);
+  std::vector<Instruction *> PIICandidates =
+      BE.collectPIICandidates(F, Options.MaxCollectionIterations);
 
   std::vector<PackedIntInstruction> PIIV;
   for (Instruction *I : PIICandidates) {
     if (!BE.checkIfIntermediate(I))
-      queueRewriting(PIIV, *I, BE);
+      queueRewriting(PIIV, *I, BE, Options.AggressiveRewriting);
     else
-      LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << "\n"
+      LLVM_DEBUG(dbgs() << "PICP intermediate inst: " << *I << '\n'
                         << " final user: "
-                        << **BE.getFinalUsers(I).begin() << "\n");
+                        << **BE.getFinalUsers(I).begin() << '\n');
   }
 
   DenseMap<Instruction *, Value *> InstSubs;
@@ -1888,7 +1804,7 @@ static bool runImpl(Function &F) {
   for (const PackedIntInstruction &PII : PIIV)
     if (Value *V = PII.rewrite(IRB)) {
       LLVM_DEBUG(dbgs() << "PICP rewrite successful for " << *PII.TargetInst
-                        << "\n");
+                        << '\n');
       InstSubs[PII.TargetInst] = V;
     }
 
@@ -1907,12 +1823,15 @@ static bool runImpl(Function &F) {
 }
 
 class PackedIntegerCombineLegacyPass : public FunctionPass {
+  PackedIntegerCombineOptions Options;
+
 public:
   static char ID;
 
-  PackedIntegerCombineLegacyPass() : FunctionPass(ID) {}
+  PackedIntegerCombineLegacyPass(PackedIntegerCombineOptions Options)
+      : FunctionPass(ID), Options(Options) {}
 
-  bool runOnFunction(Function &F) override { return runImpl(F); }
+  bool runOnFunction(Function &F) override { return runImpl(F, Options); }
 };
 
 char PackedIntegerCombineLegacyPass::ID = 0;
@@ -1920,7 +1839,7 @@ char PackedIntegerCombineLegacyPass::ID = 0;
 
 PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
-  if (!runImpl(F))
+  if (!runImpl(F, Options))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
@@ -1928,9 +1847,22 @@ PreservedAnalyses PackedIntegerCombinePass::run(Function &F,
   return PA;
 }
 
+void PackedIntegerCombinePass::printPipeline(
+    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+  static_cast<PassInfoMixin<PackedIntegerCombinePass> *>(this)->printPipeline(
+      OS, MapClassName2PassName);
+  OS << '<';
+  if (Options.AggressiveRewriting)
+    OS << "aggressive;";
+  OS << "max-iterations=" << Options.MaxCollectionIterations << '>';
+}
+
 INITIALIZE_PASS(PackedIntegerCombineLegacyPass, DEBUG_TYPE,
                 "Packed Integer Combine", false, false)
 
-FunctionPass *llvm::createPackedIntegerCombinePass() {
-  return new PackedIntegerCombineLegacyPass();
+FunctionPass *
+llvm::createPackedIntegerCombinePass(unsigned MaxCollectionIterations,
+                                     bool AggressiveRewriting) {
+  return new PackedIntegerCombineLegacyPass(
+      {MaxCollectionIterations, AggressiveRewriting});
 }
diff --git a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
index e7b6a6bc66fa1..c3804eeff6891 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/instructions.ll
@@ -1,53 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
-
-;; u0xff00ff00 = -16711936
-;; u0x00ff00ff = 16711935
-define i32 @add.0(i32 %a, i32 %b) {
-; LAZY-LABEL: define i32 @add.0(
-; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; LAZY-NEXT:    [[A_MASK:%.*]] = and i32 [[A]], -16711936
-; LAZY-NEXT:    [[B_MASK:%.*]] = and i32 [[B]], 16711935
-; LAZY-NEXT:    [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]]
-; LAZY-NEXT:    ret i32 [[ADD]]
-;
-; AGGRESSIVE-LABEL: define i32 @add.0(
-; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; AGGRESSIVE-NEXT:    [[B_MASK2:%.*]] = and i32 [[B]], 16711935
-; AGGRESSIVE-NEXT:    [[A_MASK4:%.*]] = and i32 [[A]], -16711936
-; AGGRESSIVE-NEXT:    [[ADD_MERGE:%.*]] = or disjoint i32 [[B_MASK2]], [[A_MASK4]]
-; AGGRESSIVE-NEXT:    ret i32 [[ADD_MERGE]]
-;
-  %a.mask = and i32 %a, u0xff00ff00
-  %b.mask = and i32 %b, u0x00ff00ff
-  %add = add i32 %a.mask, %b.mask
-  ret i32 %add
-}
-
-;; u0xff00ffff = -16711681
-;; u0x00ff00ff = 16711935
-;; Nothing happens in this case because of the overlapping bytes.
-define i32 @add.1(i32 %a, i32 %b) {
-; LAZY-LABEL: define i32 @add.1(
-; LAZY-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; LAZY-NEXT:    [[A_MASK:%.*]] = and i32 [[A]], -16711681
-; LAZY-NEXT:    [[B_MASK:%.*]] = and i32 [[B]], 16711935
-; LAZY-NEXT:    [[ADD:%.*]] = add i32 [[A_MASK]], [[B_MASK]]
-; LAZY-NEXT:    ret i32 [[ADD]]
-;
-; AGGRESSIVE-LABEL: define i32 @add.1(
-; AGGRESSIVE-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; AGGRESSIVE-NEXT:    [[A_MASK2:%.*]] = and i32 [[A]], -16711681
-; AGGRESSIVE-NEXT:    [[B_MASK4:%.*]] = and i32 [[B]], 16711935
-; AGGRESSIVE-NEXT:    [[ADD:%.*]] = add i32 [[A_MASK2]], [[B_MASK4]]
-; AGGRESSIVE-NEXT:    ret i32 [[ADD]]
-;
-  %a.mask = and i32 %a, u0xff00ffff
-  %b.mask = and i32 %b, u0x00ff00ff
-  %add = add i32 %a.mask, %b.mask
-  ret i32 %add
-}
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define i32 @and.0(i32 %a, i32 %b) {
 ; LAZY-LABEL: define i32 @and.0(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
index fe08ce93719d0..274d01f4ad70f 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/int2int.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define i16 @top_bytes(i32 %a, i32 %b) {
 ; LAZY-LABEL: define i16 @top_bytes(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
index e4c1538826e0f..ad4424a84a6cc 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/int2vec.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define <2 x i8> @top_bytes(i32 %a, i32 %b) {
 ; LAZY-LABEL: define <2 x i8> @top_bytes(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
index 22ee7fcf5b4c6..624f2b4f8acac 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/vec2int.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define i32 @extract_i32(<4 x i8> %from) {
 ; LAZY-LABEL: define i32 @extract_i32(
diff --git a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
index 5baefc7fb6cda..b6dbaa546c718 100644
--- a/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
+++ b/llvm/test/Transforms/PackedIntegerCombine/vec2vec.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=packedintcombine %s | FileCheck %s --check-prefix=LAZY
-; RUN: opt -S -passes=packedintcombine -packedint-aggressive-rewriter %s | FileCheck %s --check-prefix=AGGRESSIVE
+; RUN: opt -S -passes=packed-integer-combine %s | FileCheck %s --check-prefix=LAZY
+; RUN: opt -S -passes="packed-integer-combine<aggressive>" %s | FileCheck %s --check-prefix=AGGRESSIVE
 
 define <4 x i8> @obtain_v4i8(<2 x i16> %from) {
 ; LAZY-LABEL: define <4 x i8> @obtain_v4i8(