From 1c79b90c2ba0b5f4a137d7769f5939fc352c3fef Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Thu, 26 Jun 2025 12:00:42 -0700
Subject: [PATCH 1/2] [CGP] Reassociate GEPs to separate scalar and vector indexing

If we have a vector result GEP which has a scalar base pointer and a
mixture of scalar and vector indices, we can group the indices into two
GEPs (with appropriate zero indices inserted to null out terms in each)
to effectively reassociate the final addressing math. There's a
restriction here in that we can only zero terms for which replacing a
non-zero index with a zero index doesn't change the semantics of the
GEP itself. Doing so for arbitrary struct types is unsound.

This is an extension of the existing logic which tries to separate a
scalar base and a single vector operand. It's not quite a
generalization because of the struct indexing restriction.
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 261 ++++++++++++------
 llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll | 101 +++----
 .../X86/gather-scatter-opt-inseltpoison.ll    |  22 +-
 3 files changed, 229 insertions(+), 155 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 9bbb89e37865d..2138862f5335f 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -430,6 +430,10 @@ class CodeGenPrepare {
   bool optimizeInst(Instruction *I, ModifyDT &ModifiedDT);
   bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
                           unsigned AddrSpace);
+  Value *splitLastVectorIndex(Instruction *MemoryInst,
+                              const GetElementPtrInst *GEP);
+  Value *reassociateVectorOps(Instruction *MemoryInst,
+                              const GetElementPtrInst *GEP);
   bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
   bool optimizeInlineAsmInst(CallInst *CS);
   bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
@@ -6235,23 +6239,177 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   return true;
 }
 
+Value *CodeGenPrepare::splitLastVectorIndex(Instruction *MemoryInst,
+                                            const GetElementPtrInst *GEP) {
+  SmallVector<Value *, 2> Ops(GEP->operands());
+
+  bool RewriteGEP = false;
+
+  if (Ops[0]->getType()->isVectorTy()) {
+    Ops[0] = getSplatValue(Ops[0]);
+    if (!Ops[0])
+      return nullptr;
+    RewriteGEP = true;
+  }
+
+  unsigned FinalIndex = Ops.size() - 1;
+
+  // Ensure all but the last index is 0.
+  // FIXME: This isn't strictly required. All that's required is that they are
+  // all scalars or splats.
+  for (unsigned i = 1; i < FinalIndex; ++i) {
+    auto *C = dyn_cast<Constant>(Ops[i]);
+    if (!C)
+      return nullptr;
+    if (isa<VectorType>(C->getType()))
+      C = C->getSplatValue();
+    auto *CI = dyn_cast_or_null<ConstantInt>(C);
+    if (!CI || !CI->isZero())
+      return nullptr;
+    // Scalarize the index if needed.
+    Ops[i] = CI;
+  }
+
+  // Try to scalarize the final index.
+  if (Ops[FinalIndex]->getType()->isVectorTy()) {
+    if (Value *V = getSplatValue(Ops[FinalIndex])) {
+      auto *C = dyn_cast<ConstantInt>(V);
+      // Don't scalarize all zeros vector.
+      if (!C || !C->isZero()) {
+        Ops[FinalIndex] = V;
+        RewriteGEP = true;
+      }
+    }
+  }
+
+  // If we made any changes or we have extra operands, we need to generate
+  // new instructions.
+  if (!RewriteGEP && Ops.size() == 2)
+    return nullptr;
+
+  auto NumElts = cast<VectorType>(GEP->getType())->getElementCount();
+
+  IRBuilder<> Builder(MemoryInst);
+
+  Type *SourceTy = GEP->getSourceElementType();
+  Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
+
+  // If the final index isn't a vector, emit a scalar GEP containing all ops
+  // and a vector GEP with all zeroes final index.
+  if (!Ops[FinalIndex]->getType()->isVectorTy()) {
+    Value *NewAddr = Builder.CreateGEP(SourceTy, Ops[0], ArrayRef(Ops).drop_front());
+    auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts);
+    auto *SecondTy = GetElementPtrInst::getIndexedType(
+        SourceTy, ArrayRef(Ops).drop_front());
+    return Builder.CreateGEP(SecondTy, NewAddr, Constant::getNullValue(IndexTy));
+  }
+
+  Value *Base = Ops[0];
+  Value *Index = Ops[FinalIndex];
+
+  // Create a scalar GEP if there are more than 2 operands.
+  if (Ops.size() != 2) {
+    // Replace the last index with 0.
+    Ops[FinalIndex] =
+      Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType());
+    Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(Ops).drop_front());
+    SourceTy = GetElementPtrInst::getIndexedType(
+        SourceTy, ArrayRef(Ops).drop_front());
+  }
+
+  // Now create the GEP with scalar pointer and vector index.
+  return Builder.CreateGEP(SourceTy, Base, Index);
+}
+
+/// The addressing performed by a GEP is simply a set of adds. Adds are
+/// by definition reassociable. If the indices of this GEP contain
+/// both scalar and vector indices, we can split the result into two
+/// GEPs (zeroing out a subset of indices in each) while computing the
+/// same result. We do have to be careful to only zero indices where
+/// that doesn't change the type traversed (i.e. not for structs). Doing
+/// so has the effect of grouping all vector arithmetic after all scalar
+/// arithmetic, and encourages scalar base identification.
+Value *CodeGenPrepare::reassociateVectorOps(Instruction *MemoryInst,
+                                            const GetElementPtrInst *GEP) {
+  SmallVector<Value *> Ops(GEP->operands());
+  const unsigned E = Ops.size();
+
+  if (Ops[0]->getType()->isVectorTy()) {
+    Ops[0] = getSplatValue(Ops[0]);
+    if (!Ops[0])
+      return nullptr;
+  }
+
+  // Check that we have at least one non-zero scalar index and at least one
+  // non-zero vector index, and that we aren't trying to index through a
+  // struct type where changing the index to zero would be illegal.
+  bool HasNonZeroScalar = false, HasNonZeroVector = false;
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned i = 1; i < E; ++i, ++GTI) {
+    if (Value *V = getSplatValue(Ops[i]))
+      Ops[i] = V;
+
+    // Zeros don't count in terms of splitting, and zero struct
+    // indices are fine.
+    if (match(Ops[i], m_Zero()))
+      continue;
+
+    if (GTI.getStructTypeOrNull())
+      return nullptr;
+
+    if (isa<VectorType>(Ops[i]->getType()))
+      HasNonZeroVector = true;
+    else
+      HasNonZeroScalar = true;
+  }
+
+  if (!HasNonZeroVector || !HasNonZeroScalar)
+    return nullptr;
+
+  SmallVector<Value *> ScalarOps(Ops);
+  SmallVector<Value *> VectorOps(Ops);
+  for (unsigned i = 1; i < E; i++) {
+    auto *IdxTy = Ops[i]->getType()->getScalarType();
+    auto *ScalarZero = Constant::getNullValue(IdxTy);
+    if (isa<VectorType>(Ops[i]->getType()))
+      ScalarOps[i] = ScalarZero;
+    else
+      VectorOps[i] = ScalarZero;
+  }
+
+  IRBuilder<> Builder(MemoryInst);
+  Type *SourceTy = GEP->getSourceElementType();
+  Value *Base = Ops[0];
+  Base =
+      Builder.CreateGEP(SourceTy, Base, ArrayRef(ScalarOps).drop_front());
+  Base =
+      Builder.CreateGEP(SourceTy, Base, ArrayRef(VectorOps).drop_front());
+  return Base;
+}
+
+
 /// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
 /// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
-/// only handle a 2 operand GEP in the same basic block or a splat constant
-/// vector. The 2 operands to the GEP must have a scalar pointer and a vector
-/// index.
+/// only handle a GEP with a scalar pointer and one non-zero vector index in
+/// the same basic block or a splat constant vector. We use a GEP with a
+/// vector zeroinitializer index as the canonical splat form since
+/// SelectionDAG has trouble with splats which might be in different blocks.
 ///
 /// If the existing GEP has a vector base pointer that is splat, we can look
 /// through the splat to find the scalar pointer. If we can't find a scalar
 /// pointer there's nothing we can do.
 ///
-/// If we have a GEP with more than 2 indices where the middle indices are all
-/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
-///
-/// If the final index isn't a vector or is a splat, we can emit a scalar GEP
-/// followed by a GEP with an all zeroes vector index. This will enable
-/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
-/// zero index.
+/// With this goal in mind, we perform three related transforms:
+/// 1) If the GEP could be entirely scalarized, we do so and emit a separate
+///    getelementptr p, zeroinitializer to splat the result.
+/// 2) If the GEP can be scalarized except for the last index, we split the
+///    GEP into a scalar prefix, and a base + vector-index GEP for the
+///    final index.
+/// 3) If the GEP has a mixture of scalar and vector indices, we split the
+///    GEP into a pair of GEPs with some of the indices zeroed out in each.
+///    This essentially reassociates the GEP such that all scalar addressing
+///    is done before all vector addressing. This transform has restrictions
+///    when indexing through struct types.
 bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
                                                Value *Ptr) {
   Value *NewAddr;
@@ -6266,85 +6424,12 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
     if (MemoryInst->getParent() != GEP->getParent())
       return false;
 
-    SmallVector<Value *, 2> Ops(GEP->operands());
-
-    bool RewriteGEP = false;
-
-    if (Ops[0]->getType()->isVectorTy()) {
-      Ops[0] = getSplatValue(Ops[0]);
-      if (!Ops[0])
-        return false;
-      RewriteGEP = true;
-    }
-
-    unsigned FinalIndex = Ops.size() - 1;
-
-    // Ensure all but the last index is 0.
-    // FIXME: This isn't strictly required. All that's required is that they are
-    // all scalars or splats.
-    for (unsigned i = 1; i < FinalIndex; ++i) {
-      auto *C = dyn_cast<Constant>(Ops[i]);
-      if (!C)
-        return false;
-      if (isa<VectorType>(C->getType()))
-        C = C->getSplatValue();
-      auto *CI = dyn_cast_or_null<ConstantInt>(C);
-      if (!CI || !CI->isZero())
-        return false;
-      // Scalarize the index if needed.
-      Ops[i] = CI;
-    }
-
-    // Try to scalarize the final index.
-    if (Ops[FinalIndex]->getType()->isVectorTy()) {
-      if (Value *V = getSplatValue(Ops[FinalIndex])) {
-        auto *C = dyn_cast<ConstantInt>(V);
-        // Don't scalarize all zeros vector.
-        if (!C || !C->isZero()) {
-          Ops[FinalIndex] = V;
-          RewriteGEP = true;
-        }
-      }
-    }
-
-    // If we made any changes or the we have extra operands, we need to generate
-    // new instructions.
-    if (!RewriteGEP && Ops.size() == 2)
+    if (auto *V = splitLastVectorIndex(MemoryInst, GEP))
+      NewAddr = V;
+    else if (auto *V = reassociateVectorOps(MemoryInst, GEP))
+      NewAddr = V;
+    else
       return false;
-
-    auto NumElts = cast<VectorType>(Ptr->getType())->getElementCount();
-
-    IRBuilder<> Builder(MemoryInst);
-
-    Type *SourceTy = GEP->getSourceElementType();
-    Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
-
-    // If the final index isn't a vector, emit a scalar GEP containing all ops
-    // and a vector GEP with all zeroes final index.
-    if (!Ops[FinalIndex]->getType()->isVectorTy()) {
-      NewAddr = Builder.CreateGEP(SourceTy, Ops[0], ArrayRef(Ops).drop_front());
-      auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts);
-      auto *SecondTy = GetElementPtrInst::getIndexedType(
-          SourceTy, ArrayRef(Ops).drop_front());
-      NewAddr =
-          Builder.CreateGEP(SecondTy, NewAddr, Constant::getNullValue(IndexTy));
-    } else {
-      Value *Base = Ops[0];
-      Value *Index = Ops[FinalIndex];
-
-      // Create a scalar GEP if there are more than 2 operands.
-      if (Ops.size() != 2) {
-        // Replace the last index with 0.
-        Ops[FinalIndex] =
-            Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType());
-        Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(Ops).drop_front());
-        SourceTy = GetElementPtrInst::getIndexedType(
-            SourceTy, ArrayRef(Ops).drop_front());
-      }
-
-      // Now create the GEP with scalar pointer and vector index.
-      NewAddr = Builder.CreateGEP(SourceTy, Base, Index);
-    }
   } else if (!isa<Constant>(Ptr)) {
     // Not a GEP, maybe its a splat and we can create a GEP to enable
    // SelectionDAGBuilder to use it as a uniform base.
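
To make transform 3 concrete, here is a minimal IR sketch modeled on the
reassociate test case added below; the function names are illustrative, not
part of the patch. Before CodeGenPrepare, the gather's address is a single GEP
mixing a vector index with a non-zero scalar index; the expected rewrite does
all scalar addressing in a leading GEP and leaves only the vector index in the
second one, which SelectionDAGBuilder can then treat as a uniform base plus a
vector offset:

  declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)

  ; Input: one GEP with a vector index followed by a non-zero scalar index.
  define <4 x i32> @reassociate_example(ptr %base, i64 %index, <4 x i64> %vecidx) {
    %gep = getelementptr [256 x i32], ptr %base, <4 x i64> %vecidx, i64 %index
    %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep, i32 4, <4 x i1> splat (i1 true), <4 x i32> undef)
    ret <4 x i32> %res
  }

  ; Expected form after reassociateVectorOps: the scalar index feeds a
  ; scalar-base GEP, the vector index feeds a second GEP, and the unused
  ; index in each is replaced by zero.
  define <4 x i32> @reassociate_example_expected(ptr %base, i64 %index, <4 x i64> %vecidx) {
    %scalar.base = getelementptr [256 x i32], ptr %base, i64 0, i64 %index
    %gep = getelementptr [256 x i32], ptr %scalar.base, <4 x i64> %vecidx, i64 0
    %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep, i32 4, <4 x i1> splat (i1 true), <4 x i32> undef)
    ret <4 x i32> %res
  }

Both GEPs step through the same [256 x i32] type, so zeroing the unused index
in each does not change the addresses being computed.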
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index 3057ee9293992..d728262b599f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2377,26 +2377,21 @@ define @mgather_baseidx_zext_nxv1i1_nxv1i8(ptr %base, @scalar_prefix(ptr %base, i32 signext %index, <4 x i32> %vecidx) { ; RV32-LABEL: scalar_prefix: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 10 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsll.vi v9, v9, 10 -; RV32-NEXT: vadd.vx v9, v9, a0 ; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vluxei32.v v8, (zero), v8 +; RV32-NEXT: vluxei32.v v8, (a0), v8 ; RV32-NEXT: ret ; ; RV64-LABEL: scalar_prefix: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 1024 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a2 -; RV64-NEXT: vwmaccsu.vx v10, a1, v9 -; RV64-NEXT: li a0, 4 -; RV64-NEXT: vwmaccus.vx v10, a0, v8 -; RV64-NEXT: vluxei64.v v8, (zero), v10 +; RV64-NEXT: slli a1, a1, 10 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 4 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vwmulsu.vx v10, v8, a1 +; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %gep = getelementptr [256 x i32], ptr %base, i32 %index, <4 x i32> %vecidx %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep, i32 4, <4 x i1> , <4 x i32> undef) @@ -2406,26 +2401,22 @@ define <4 x i32> @scalar_prefix(ptr %base, i32 signext %index, <4 x i32> %vecidx define <4 x i32> @scalar_prefix_with_splat(ptr %base, i32 %index, <4 x i32> %vecidx) { ; RV32-LABEL: scalar_prefix_with_splat: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 10 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsll.vi v9, v9, 10 -; RV32-NEXT: vadd.vx v9, v9, a0 ; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vluxei32.v v8, (zero), v8 +; RV32-NEXT: vluxei32.v v8, (a0), v8 ; RV32-NEXT: ret ; ; RV64-LABEL: scalar_prefix_with_splat: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 1024 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a2 -; RV64-NEXT: vwmaccsu.vx v10, a1, v9 -; RV64-NEXT: li a0, 4 -; RV64-NEXT: vwmaccus.vx v10, a0, v8 -; RV64-NEXT: vluxei64.v v8, (zero), v10 +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: slli a1, a1, 10 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 4 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vwmulsu.vx v10, v8, a1 +; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer @@ -2447,11 +2438,11 @@ define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i32> %vecidx ; ; RV64-LABEL: scalar_prefix_with_constant_splat: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 5 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: li a1, 4 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vwmulsu.vx v10, v8, a1 -; RV64-NEXT: lui a1, 5 -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %gep = getelementptr [256 x i32], ptr %base, <4 x i32> splat (i32 20), <4 x i32> %vecidx @@ -2462,25 +2453,22 
@@ define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i32> %vecidx define <4 x i32> @reassociate(ptr %base, i32 %index, <4 x i32> %vecidx) { ; RV32-LABEL: reassociate: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 10 -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vadd.vx v8, v8, a0 -; RV32-NEXT: vsll.vi v9, v9, 2 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: vluxei32.v v8, (zero), v8 +; RV32-NEXT: vluxei32.v v8, (a0), v8 ; RV32-NEXT: ret ; ; RV64-LABEL: reassociate: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: li a0, 1024 -; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vwmaccus.vx v10, a0, v8 -; RV64-NEXT: vmv.v.i v8, 4 -; RV64-NEXT: vwmaccsu.vx v10, a1, v8 -; RV64-NEXT: vluxei64.v v8, (zero), v10 +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 1024 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vwmulsu.vx v10, v8, a1 +; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %gep = getelementptr [256 x i32], ptr %base, <4 x i32> %vecidx, i32 %index %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep, i32 4, <4 x i1> , <4 x i32> undef) @@ -2490,25 +2478,22 @@ define <4 x i32> @reassociate(ptr %base, i32 %index, <4 x i32> %vecidx) { define <4 x i32> @reassociate_with_splat(ptr %base, i32 %index, <4 x i32> %vecidx) { ; RV32-LABEL: reassociate_with_splat: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsll.vi v8, v8, 10 -; RV32-NEXT: vadd.vx v8, v8, a0 -; RV32-NEXT: vsll.vi v9, v9, 2 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: vluxei32.v v8, (zero), v8 +; RV32-NEXT: vluxei32.v v8, (a0), v8 ; RV32-NEXT: ret ; ; RV64-LABEL: reassociate_with_splat: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: li a0, 1024 -; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vwmaccus.vx v10, a0, v8 -; RV64-NEXT: vmv.v.i v8, 4 -; RV64-NEXT: vwmaccsu.vx v10, a1, v8 -; RV64-NEXT: vluxei64.v v8, (zero), v10 +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 1024 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vwmulsu.vx v10, v8, a1 +; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer @@ -2521,18 +2506,18 @@ define <4 x i32> @reassociate_with_splat(ptr %base, i32 %index, <4 x i32> %vecid define <4 x i32> @reassociate_with_constant_splat(ptr %base, i32 %index, <4 x i32> %vecidx) { ; RV32-LABEL: reassociate_with_constant_splat: ; RV32: # %bb.0: +; RV32-NEXT: addi a0, a0, 80 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 10 -; RV32-NEXT: addi a0, a0, 80 ; RV32-NEXT: vluxei32.v v8, (a0), v8 ; RV32-NEXT: ret ; ; RV64-LABEL: reassociate_with_constant_splat: ; RV64: # %bb.0: +; RV64-NEXT: addi a0, a0, 80 ; RV64-NEXT: li a1, 1024 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vwmulsu.vx v10, v8, a1 -; RV64-NEXT: addi a0, a0, 80 ; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %gep = getelementptr [256 x i32], ptr %base, <4 x i32> %vecidx, <4 x i32> splat 
(i32 20) diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll index e27d5d772a7a4..09932fe403e4e 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll @@ -112,7 +112,9 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) { define <4 x i32> @scalar_prefix(ptr %base, i64 %index, <4 x i64> %vecidx) { ; CHECK-LABEL: @scalar_prefix( -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 [[INDEX:%.*]], <4 x i64> [[VECIDX:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 [[INDEX:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast ptr [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP3]], <4 x i64> [[VECIDX:%.*]] ; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; @@ -123,9 +125,9 @@ define <4 x i32> @scalar_prefix(ptr %base, i64 %index, <4 x i64> %vecidx) { define <4 x i32> @scalar_prefix_with_splat(ptr %base, i64 %index, <4 x i64> %vecidx) { ; CHECK-LABEL: @scalar_prefix_with_splat( -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> [[VECIDX:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 [[INDEX:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast ptr [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP3]], <4 x i64> [[VECIDX:%.*]] ; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; @@ -139,7 +141,9 @@ define <4 x i32> @scalar_prefix_with_splat(ptr %base, i64 %index, <4 x i64> %vec define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i64> %vecidx) { ; CHECK-LABEL: @scalar_prefix_with_constant_splat( -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> splat (i64 20), <4 x i64> [[VECIDX:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 20, i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast ptr [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP3]], <4 x i64> [[VECIDX:%.*]] ; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; @@ -150,7 +154,8 @@ define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i64> %vecidx define <4 x i32> @reassociate(ptr %base, i64 %index, <4 x i64> %vecidx) { ; CHECK-LABEL: @reassociate( -; CHECK-NEXT: [[GEP:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[VECIDX:%.*]], i64 [[INDEX:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 0, i64 [[INDEX:%.*]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [256 x i32], ptr [[TMP1]], <4 x i64> [[VECIDX:%.*]], i64 0 ; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[GEP]], i32 4, 
<4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; @@ -161,9 +166,8 @@ define <4 x i32> @reassociate(ptr %base, i64 %index, <4 x i64> %vecidx) { define <4 x i32> @reassociate_with_splat(ptr %base, i64 %index, <4 x i64> %vecidx) { ; CHECK-LABEL: @reassociate_with_splat( -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[GEP:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[VECIDX:%.*]], <4 x i64> [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 0, i64 [[INDEX:%.*]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [256 x i32], ptr [[TMP1]], <4 x i64> [[VECIDX:%.*]], i64 0 ; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[GEP]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; From 13a4969bfc6a29ab2d10aefee01e8d390b697634 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 30 Jun 2025 08:59:48 -0700 Subject: [PATCH 2/2] clang-format --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 2138862f5335f..d4319c5c3ac69 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6297,11 +6297,13 @@ Value *CodeGenPrepare::splitLastVectorIndex(Instruction *MemoryInst, // If the final index isn't a vector, emit a scalar GEP containing all ops // and a vector GEP with all zeroes final index. if (!Ops[FinalIndex]->getType()->isVectorTy()) { - Value *NewAddr = Builder.CreateGEP(SourceTy, Ops[0], ArrayRef(Ops).drop_front()); + Value *NewAddr = + Builder.CreateGEP(SourceTy, Ops[0], ArrayRef(Ops).drop_front()); auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts); - auto *SecondTy = GetElementPtrInst::getIndexedType( - SourceTy, ArrayRef(Ops).drop_front()); - return Builder.CreateGEP(SecondTy, NewAddr, Constant::getNullValue(IndexTy)); + auto *SecondTy = + GetElementPtrInst::getIndexedType(SourceTy, ArrayRef(Ops).drop_front()); + return Builder.CreateGEP(SecondTy, NewAddr, + Constant::getNullValue(IndexTy)); } Value *Base = Ops[0]; @@ -6311,10 +6313,10 @@ Value *CodeGenPrepare::splitLastVectorIndex(Instruction *MemoryInst, if (Ops.size() != 2) { // Replace the last index with 0. Ops[FinalIndex] = - Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType()); + Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType()); Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(Ops).drop_front()); - SourceTy = GetElementPtrInst::getIndexedType( - SourceTy, ArrayRef(Ops).drop_front()); + SourceTy = + GetElementPtrInst::getIndexedType(SourceTy, ArrayRef(Ops).drop_front()); } // Now create the GEP with scalar pointer and vector index. 
@@ -6380,14 +6382,11 @@ Value *CodeGenPrepare::reassociateVectorOps(Instruction *MemoryInst, IRBuilder<> Builder(MemoryInst); Type *SourceTy = GEP->getSourceElementType(); Value *Base = Ops[0]; - Base = - Builder.CreateGEP(SourceTy, Base, ArrayRef(ScalarOps).drop_front()); - Base = - Builder.CreateGEP(SourceTy, Base, ArrayRef(VectorOps).drop_front()); + Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(ScalarOps).drop_front()); + Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(VectorOps).drop_front()); return Base; } - /// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find /// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can /// only handle a GEP with a scalar pointer and one non-zero vector index in
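
For the struct-indexing restriction mentioned in the commit message, here is a
minimal sketch of a GEP that the new reassociateVectorOps deliberately leaves
alone (the function and type names are illustrative, not taken from the
patch):

  declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)

  %pair = type { i64, [256 x i32] }

  ; The constant field index (i32 1) selects the [256 x i32] member at a fixed
  ; offset; replacing it with zero would address the i64 field instead and
  ; change the type being traversed, so the rewrite must not split this GEP.
  define <4 x i32> @struct_index_left_alone(ptr %base, <4 x i64> %vecidx) {
    %gep = getelementptr %pair, ptr %base, i64 0, i32 1, <4 x i64> %vecidx
    %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep, i32 4, <4 x i1> splat (i1 true), <4 x i32> undef)
    ret <4 x i32> %res
  }

In this case the non-zero struct index makes reassociateVectorOps bail out via
GTI.getStructTypeOrNull(), and splitLastVectorIndex also rejects the GEP
because a middle index is non-zero, so the address is left for the existing
lowering paths.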