From 7af365c0ec0b693abfd78b65280721572120dcd3 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 7 Jul 2025 14:29:22 -0700 Subject: [PATCH 1/2] [IA] Support deinterleave intrinsics w/ fewer than N extracts For the fixed vector cases, we already support this, but the deinterleave intrinsic cases (primarily used by scalable vectors) didn't. Supporting it requires plumbing through the Factor separately from the extracts, as there can now be fewer extracts than the Factor. Note that the fixed vector path handles this slightly differently - it uses the shuffle and indices scheme to achieve the same thing. --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 22 +++++--- .../Target/AArch64/AArch64ISelLowering.cpp | 20 ++++--- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 5 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 21 +++---- llvm/lib/Target/RISCV/RISCVISelLowering.h | 8 ++- .../RISCV/rvv/vector-deinterleave-load.ll | 55 ++----------------- 7 files changed, 53 insertions(+), 84 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index fee94cc167363..eb7fce106dba1 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3230,7 +3230,8 @@ class LLVM_ABI TargetLoweringBase { /// \p Mask is a mask value /// \p DeinterleaveRes is a list of deinterleaved results. virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const { + ArrayRef DeinterleaveRes, + unsigned Factor) const { return false; } @@ -3253,7 +3254,8 @@ class LLVM_ABI TargetLoweringBase { /// \p DeinterleaveValues contains the deinterleaved values. 
virtual bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI, - ArrayRef DeinterleaveValues) const { + ArrayRef DeinterleaveValues, + unsigned Factor) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 9c4c86cebe7e5..04daa7656f74e 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -377,7 +377,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( SmallVector ShuffleValues(Factor, nullptr); for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; - if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) + if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues, Factor)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; } else { @@ -670,9 +670,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; const unsigned Factor = getIntrinsicFactor(DI); - if (!DI->hasNUses(Factor)) - return false; - SmallVector DeinterleaveValues(Factor); + SmallVector DeinterleaveValues(Factor, nullptr); + Value *LastFactor = nullptr; for (auto *User : DI->users()) { auto *Extract = dyn_cast(User); if (!Extract || Extract->getNumIndices() != 1) @@ -681,15 +680,19 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( if (DeinterleaveValues[Idx]) return false; DeinterleaveValues[Idx] = Extract; + LastFactor = Extract; } + if (!LastFactor) + return false; + if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) return false; // Check mask operand. Handle both all-true/false and interleaved mask. 
Value *WideMask = VPLoad->getOperand(1); - Value *Mask = getMask(WideMask, Factor, - cast(DeinterleaveValues[0]->getType())); + Value *Mask = + getMask(WideMask, Factor, cast(LastFactor->getType())); if (!Mask) return false; @@ -698,7 +701,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) + if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues, Factor)) return false; } else { @@ -710,12 +713,13 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( << " and factor = " << Factor << "\n"); // Try and match this with target specific intrinsics. - if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) + if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues, Factor)) return false; } for (Value *V : DeinterleaveValues) - DeadInsts.insert(cast(V)); + if (V) + DeadInsts.insert(cast(V)); DeadInsts.insert(DI); // We now have a target-specific load, so delete the old one. 
DeadInsts.insert(cast(LoadedVal)); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cdb68684b3856..30a9e66fec88d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17456,14 +17456,16 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, } bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef DeinterleavedValues) const { - unsigned Factor = DeinterleavedValues.size(); + LoadInst *LI, ArrayRef DeinterleavedValues, + unsigned Factor) const { if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n"); return false; } - VectorType *VTy = cast(DeinterleavedValues[0]->getType()); + Value *FirstActive = *llvm::find_if(DeinterleavedValues, + [](Value *V) { return V != nullptr; }); + VectorType *VTy = cast(FirstActive->getType()); const DataLayout &DL = LI->getModule()->getDataLayout(); bool UseScalable; @@ -17512,8 +17514,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump()); } // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned J = 0; J < Factor; ++J) - DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]); + for (unsigned J = 0; J < Factor; ++J) { + if (DeinterleavedValues[J]) + DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]); + } } else { Value *Result; if (UseScalable) @@ -17522,8 +17526,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 for (unsigned I = 0; I < Factor; I++) { - Value *NewExtract = Builder.CreateExtractValue(Result, I); - DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); + if (DeinterleavedValues[I]) { + Value *NewExtract = Builder.CreateExtractValue(Result, I); + 
DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); + } } } return true; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 89f90ee2b7707..29ec0d3ffc81f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -218,8 +218,9 @@ class AArch64TargetLowering : public TargetLowering { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI, + ArrayRef DeinterleaveValues, + unsigned Factor) const override; bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5126ab6c31c28..9f84dddba0ce5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -24255,15 +24255,16 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, } bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef DeinterleaveValues) const { - unsigned Factor = DeinterleaveValues.size(); + LoadInst *LI, ArrayRef DeinterleaveValues, unsigned Factor) const { if (Factor > 8) return false; assert(LI->isSimple()); IRBuilder<> Builder(LI); - auto *ResVTy = cast(DeinterleaveValues[0]->getType()); + Value *FirstActive = *llvm::find_if(DeinterleaveValues, + [](Value *V) { return V != nullptr; }); + VectorType *ResVTy = cast(FirstActive->getType()); const DataLayout &DL = LI->getDataLayout(); @@ -24314,6 +24315,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( } for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { + if (!DIV) + continue; // We have to create a brand new ExtractValue to replace each // of these old ExtractValue instructions. 
Value *NewEV = @@ -24437,17 +24440,15 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// dealing with factor of 2 (extractvalue is still required for most of other /// factors though). bool RISCVTargetLowering::lowerInterleavedVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveResults) const { + VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveResults, + unsigned Factor) const { assert(Mask && "Expect a valid mask"); assert(Load->getIntrinsicID() == Intrinsic::vp_load && "Unexpected intrinsic"); - const unsigned Factor = DeinterleaveResults.size(); - - auto *VTy = dyn_cast(DeinterleaveResults[0]->getType()); - if (!VTy) - return false; + Value *FirstActive = *llvm::find_if(DeinterleaveResults, + [](Value *V) { return V != nullptr; }); + VectorType *VTy = cast(FirstActive->getType()); auto &DL = Load->getModule()->getDataLayout(); Align Alignment = Load->getParamAlign(0).value_or( diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index f67d7f155c9d0..a1b283e35074a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -437,14 +437,16 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI, + ArrayRef DeinterleaveValues, + unsigned Factor) const override; bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const override; + ArrayRef DeinterleaveRes, + unsigned Factor) const override; bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOps) const override; diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index a353e8e4398fa..9af92aa995f1f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -52,9 +52,8 @@ define {, } @vector_deinterleave_load_nxv16i define @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_nxv16i8_nxv32i8_oneactive: ; CHECK: # %bb.0: -; CHECK-NEXT: vl4r.v v12, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) @@ -65,9 +64,8 @@ define @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive(pt define @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive2(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_nxv16i8_nxv32i8_oneactive2: ; CHECK: # %bb.0: -; CHECK-NEXT: vl4r.v v12, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v12, 8 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vlseg2e8.v v6, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) @@ -409,23 +407,8 @@ define { , , , @vector_deinterleave_load_factor4_oneactive(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: vl4r.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg4e8.v v8, 
(a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = call { , , , } @llvm.vector.deinterleave4( %vec) @@ -436,23 +419,8 @@ define @vector_deinterleave_load_factor4_oneactive(ptr %p) { define @vector_deinterleave_load_factor4_oneactive2(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive2: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: vl4r.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg4e8.v v5, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = call { , , , } @llvm.vector.deinterleave4( %vec) @@ -463,23 +431,8 @@ define @vector_deinterleave_load_factor4_oneactive2(ptr %p) { define { , , , } @vector_deinterleave_load_factor4_twoactive(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4_twoactive: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: vl4r.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: csrr a0, 
vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = call { , , , } @llvm.vector.deinterleave4( %vec) From f4f9ffe087d3d54de374f7b01648b422a074da6c Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 8 Jul 2025 10:23:49 -0700 Subject: [PATCH 2/2] Clang-format --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9f84dddba0ce5..b5e2a58997664 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -24262,8 +24262,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( assert(LI->isSimple()); IRBuilder<> Builder(LI); - Value *FirstActive = *llvm::find_if(DeinterleaveValues, - [](Value *V) { return V != nullptr; }); + Value *FirstActive = + *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); VectorType *ResVTy = cast(FirstActive->getType()); const DataLayout &DL = LI->getDataLayout();