[IA] Support deinterleave intrinsics w/ fewer than N extracts #147572

Merged: 2 commits, Jul 9, 2025
6 changes: 4 additions & 2 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3230,7 +3230,8 @@ class LLVM_ABI TargetLoweringBase {
/// \p Mask is a mask value
/// \p DeinterleaveRes is a list of deinterleaved results.
virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
ArrayRef<Value *> DeinterleaveRes) const {
ArrayRef<Value *> DeinterleaveRes,
unsigned Factor) const {
return false;
}

@@ -3253,7 +3254,8 @@
/// \p DeinterleaveValues contains the deinterleaved values.
virtual bool
lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
ArrayRef<Value *> DeinterleaveValues) const {
ArrayRef<Value *> DeinterleaveValues,
unsigned Factor) const {
return false;
}

22 changes: 13 additions & 9 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -377,7 +377,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues, Factor))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
} else {
@@ -670,9 +670,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
return false;

const unsigned Factor = getIntrinsicFactor(DI);
if (!DI->hasNUses(Factor))
return false;
SmallVector<Value *, 8> DeinterleaveValues(Factor);
SmallVector<Value *, 8> DeinterleaveValues(Factor, nullptr);
Value *LastFactor = nullptr;
for (auto *User : DI->users()) {
auto *Extract = dyn_cast<ExtractValueInst>(User);
if (!Extract || Extract->getNumIndices() != 1)
@@ -681,15 +680,19 @@
if (DeinterleaveValues[Idx])
return false;
DeinterleaveValues[Idx] = Extract;
LastFactor = Extract;
}

if (!LastFactor)
return false;

if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
Value *WideMask = VPLoad->getOperand(1);
Value *Mask = getMask(WideMask, Factor,
cast<VectorType>(DeinterleaveValues[0]->getType()));
Value *Mask =
getMask(WideMask, Factor, cast<VectorType>(LastFactor->getType()));
if (!Mask)
return false;

@@ -698,7 +701,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(

// Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
// TLI function to emit target-specific interleaved instruction.
if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues, Factor))
Contributor: Should we just plumb through LastFactor->getType() to save TLIs from having to do find_if on the DeinterleaveValues each time?

Collaborator (Author): I'm going to be looking at reworking this interface in a couple of upcoming patches; I'll try this and see how it works out.
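For illustration only, a minimal sketch of the interface shape suggested above (hypothetical signature, not part of this patch; the final interface may differ once the planned rework lands):

```cpp
// Hypothetical variant of the TLI hook: pass the vector type of one known
// non-null deinterleaved value (e.g. LastFactor->getType()) directly, so
// targets no longer need to run find_if over DeinterleaveValues themselves.
virtual bool
lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
                                 ArrayRef<Value *> DeinterleaveValues,
                                 VectorType *DeinterleavedVTy,
                                 unsigned Factor) const {
  return false;
}
```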

return false;

} else {
@@ -710,12 +713,13 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
<< " and factor = " << Factor << "\n");

// Try and match this with target specific intrinsics.
if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues, Factor))
return false;
}

for (Value *V : DeinterleaveValues)
DeadInsts.insert(cast<Instruction>(V));
if (V)
DeadInsts.insert(cast<Instruction>(V));
DeadInsts.insert(DI);
// We now have a target-specific load, so delete the old one.
DeadInsts.insert(cast<Instruction>(LoadedVal));
20 changes: 13 additions & 7 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17456,14 +17456,16 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}

bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
unsigned Factor = DeinterleavedValues.size();
LoadInst *LI, ArrayRef<Value *> DeinterleavedValues,
unsigned Factor) const {
if (Factor != 2 && Factor != 4) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
}

VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
Value *FirstActive = *llvm::find_if(DeinterleavedValues,
[](Value *V) { return V != nullptr; });
VectorType *VTy = cast<VectorType>(FirstActive->getType());

const DataLayout &DL = LI->getModule()->getDataLayout();
bool UseScalable;
@@ -17512,8 +17514,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
// Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
for (unsigned J = 0; J < Factor; ++J)
DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
for (unsigned J = 0; J < Factor; ++J) {
if (DeinterleavedValues[J])
DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
}
} else {
Value *Result;
if (UseScalable)
Expand All @@ -17522,8 +17526,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
// Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
for (unsigned I = 0; I < Factor; I++) {
Value *NewExtract = Builder.CreateExtractValue(Result, I);
DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
if (DeinterleavedValues[I]) {
Value *NewExtract = Builder.CreateExtractValue(Result, I);
DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
}
}
}
return true;
5 changes: 3 additions & 2 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -218,8 +218,9 @@ class AArch64TargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;

bool lowerDeinterleaveIntrinsicToLoad(
LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
ArrayRef<Value *> DeinterleaveValues,
unsigned Factor) const override;

bool lowerInterleaveIntrinsicToStore(
StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
21 changes: 11 additions & 10 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24255,15 +24255,16 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}

bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const {
unsigned Factor = DeinterleaveValues.size();
LoadInst *LI, ArrayRef<Value *> DeinterleaveValues, unsigned Factor) const {
if (Factor > 8)
return false;

assert(LI->isSimple());
IRBuilder<> Builder(LI);

auto *ResVTy = cast<VectorType>(DeinterleaveValues[0]->getType());
Value *FirstActive =
*llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
VectorType *ResVTy = cast<VectorType>(FirstActive->getType());

const DataLayout &DL = LI->getDataLayout();

@@ -24314,6 +24315,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
}

for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
if (!DIV)
continue;
// We have to create a brand new ExtractValue to replace each
// of these old ExtractValue instructions.
Value *NewEV =
@@ -24437,17 +24440,15 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
/// dealing with factor of 2 (extractvalue is still required for most of other
/// factors though).
bool RISCVTargetLowering::lowerInterleavedVPLoad(
VPIntrinsic *Load, Value *Mask,
ArrayRef<Value *> DeinterleaveResults) const {
VPIntrinsic *Load, Value *Mask, ArrayRef<Value *> DeinterleaveResults,
unsigned Factor) const {
assert(Mask && "Expect a valid mask");
assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
"Unexpected intrinsic");

const unsigned Factor = DeinterleaveResults.size();

auto *VTy = dyn_cast<VectorType>(DeinterleaveResults[0]->getType());
if (!VTy)
return false;
Value *FirstActive = *llvm::find_if(DeinterleaveResults,
[](Value *V) { return V != nullptr; });
VectorType *VTy = cast<VectorType>(FirstActive->getType());

auto &DL = Load->getModule()->getDataLayout();
Align Alignment = Load->getParamAlign(0).value_or(
8 changes: 5 additions & 3 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -437,14 +437,16 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;

bool lowerDeinterleaveIntrinsicToLoad(
LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
ArrayRef<Value *> DeinterleaveValues,
unsigned Factor) const override;

bool lowerInterleaveIntrinsicToStore(
StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;

bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
ArrayRef<Value *> DeinterleaveRes) const override;
ArrayRef<Value *> DeinterleaveRes,
unsigned Factor) const override;

bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const override;
55 changes: 4 additions & 51 deletions llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -52,9 +52,8 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_load_nxv16i
define <vscale x 16 x i8> @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_nxv16i8_nxv32i8_oneactive:
; CHECK: # %bb.0:
; CHECK-NEXT: vl4r.v v12, (a0)
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v12, 0
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vlseg2e8.v v8, (a0)
Collaborator (Author): FYI, this is a missed optimization which will follow in a separate patch. We should be using the strided load for the single active case. We do this for the shuffle path, but not for the intrinsic path.

; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
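As context for the comment above, here is a rough sketch of how the single-active case could use a strided load instead of a segment load, mirroring what the shuffle-based path already does. This is a sketch only: ActiveIdx and ElemSizeInBytes are assumed helper values, and none of this code is part of this patch.

```cpp
// Sketch (assumed values: ActiveIdx = index of the one live field,
// ElemSizeInBytes = store size of the element type). Load just that field
// with llvm.experimental.vp.strided.load instead of a full vlseg.
IRBuilder<> Builder(LI);
Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
Value *BasePtr = Builder.CreatePtrAdd(
    LI->getPointerOperand(),
    ConstantInt::get(XLenTy, ActiveIdx * ElemSizeInBytes));
Value *Stride = ConstantInt::get(XLenTy, Factor * ElemSizeInBytes);
Value *AllOnes = Builder.getAllOnesMask(ResVTy->getElementCount());
Value *VL = Builder.CreateElementCount(XLenTy, ResVTy->getElementCount());
CallInst *StridedLoad = Builder.CreateIntrinsic(
    Intrinsic::experimental_vp_strided_load,
    {ResVTy, BasePtr->getType(), XLenTy}, {BasePtr, Stride, AllOnes, VL},
    /*FMFSource=*/nullptr, "strided.load");
```

The strided load's result would then replace the single live extractvalue directly, rather than extracting one field from a segment-load result.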
@@ -65,9 +64,8 @@ define <vscale x 16 x i8> @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive(pt
define <vscale x 16 x i8> @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive2(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_nxv16i8_nxv32i8_oneactive2:
; CHECK: # %bb.0:
; CHECK-NEXT: vl4r.v v12, (a0)
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v12, 8
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vlseg2e8.v v6, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
@@ -409,23 +407,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; CHECK-NEXT: vl4r.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -436,23 +419,8 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive2:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; CHECK-NEXT: vl4r.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg4e8.v v5, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -463,23 +431,8 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor4_twoactive(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4_twoactive:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; CHECK-NEXT: vl4r.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)