
Commit 4a66ba2

[IA] Support deinterleave intrinsics w/ fewer than N extracts (#147572)
For the fixed vector cases, we already support this, but the deinterleave intrinsic cases (primarily used by scalable vectors) didn't. Supporting it requires plumbing the Factor through separately from the extracts, since there can now be fewer extracts than the Factor. Note that the fixed vector path handles this slightly differently: it uses the shuffle-and-indices scheme to achieve the same thing.
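To illustrate, here is a minimal IR sketch of the case this enables (hypothetical function name, modeled on the tests in this change rather than copied from them): a factor-4 deinterleave where only field 0 is extracted. Previously the pass bailed out unless the intrinsic had exactly Factor extractvalue users; now the unused fields may simply have no extract, and the load can still be lowered to a segment load (e.g. vlseg4 on RISC-V).

; A factor-4 deinterleave with a single extractvalue user. The other three
; fields are never extracted, which the pass now tolerates.
define <vscale x 8 x i8> @deinterleave4_one_extract(ptr %p) {
  %vec = load <vscale x 32 x i8>, ptr %p
  %d = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
  %f0 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d, 0
  ret <vscale x 8 x i8> %f0
}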
1 parent 76a841a commit 4a66ba2

7 files changed, +53 −84 lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 2 deletions
@@ -3230,7 +3230,8 @@ class LLVM_ABI TargetLoweringBase {
   /// \p Mask is a mask value
   /// \p DeinterleaveRes is a list of deinterleaved results.
   virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                                      ArrayRef<Value *> DeinterleaveRes) const {
+                                      ArrayRef<Value *> DeinterleaveRes,
+                                      unsigned Factor) const {
     return false;
   }

@@ -3253,7 +3254,8 @@ class LLVM_ABI TargetLoweringBase {
   /// \p DeinterleaveValues contains the deinterleaved values.
   virtual bool
   lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
-                                   ArrayRef<Value *> DeinterleaveValues) const {
+                                   ArrayRef<Value *> DeinterleaveValues,
+                                   unsigned Factor) const {
     return false;
   }

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 13 additions & 9 deletions
@@ -380,7 +380,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
     SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
     for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
       ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
-    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
+    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues, Factor))
       // If Extracts is not empty, tryReplaceExtracts made changes earlier.
       return !Extracts.empty() || BinOpShuffleChanged;
   } else {
@@ -673,9 +673,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
     return false;

   const unsigned Factor = getIntrinsicFactor(DI);
-  if (!DI->hasNUses(Factor))
-    return false;
-  SmallVector<Value *, 8> DeinterleaveValues(Factor);
+  SmallVector<Value *, 8> DeinterleaveValues(Factor, nullptr);
+  Value *LastFactor = nullptr;
   for (auto *User : DI->users()) {
     auto *Extract = dyn_cast<ExtractValueInst>(User);
     if (!Extract || Extract->getNumIndices() != 1)
@@ -684,15 +683,19 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
     if (DeinterleaveValues[Idx])
       return false;
     DeinterleaveValues[Idx] = Extract;
+    LastFactor = Extract;
   }

+  if (!LastFactor)
+    return false;
+
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
     if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
       return false;
     // Check mask operand. Handle both all-true/false and interleaved mask.
     Value *WideMask = VPLoad->getOperand(1);
-    Value *Mask = getMask(WideMask, Factor,
-                          cast<VectorType>(DeinterleaveValues[0]->getType()));
+    Value *Mask =
+        getMask(WideMask, Factor, cast<VectorType>(LastFactor->getType()));
     if (!Mask)
       return false;

@@ -701,7 +704,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(

     // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
+    if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues, Factor))
       return false;

   } else {
@@ -713,12 +716,13 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
                       << " and factor = " << Factor << "\n");

     // Try and match this with target specific intrinsics.
-    if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
+    if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues, Factor))
       return false;
   }

   for (Value *V : DeinterleaveValues)
-    DeadInsts.insert(cast<Instruction>(V));
+    if (V)
+      DeadInsts.insert(cast<Instruction>(V));
   DeadInsts.insert(DI);
   // We now have a target-specific load, so delete the old one.
   DeadInsts.insert(cast<Instruction>(LoadedVal));
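For the VP path touched above, a comparable IR sketch (hypothetical names, not taken from the tests): an llvm.vp.load feeding llvm.vector.deinterleave2 with only one of the two fields extracted. As the comment in the hunk notes, getMask only accepts an all-true/false or interleaved wide mask, so an arbitrary %m is lowered only when it has that shape.

; vp.load + deinterleave2 where only field 1 is used. Lowering to a segment
; load happens only if %m is all-true/false or an interleaving of a narrower
; per-field mask (see getMask in the hunk above).
define <vscale x 16 x i8> @vpload_deinterleave2_one_extract(ptr %p, <vscale x 32 x i1> %m, i32 %evl) {
  %wide = call <vscale x 32 x i8> @llvm.vp.load.nxv32i8.p0(ptr %p, <vscale x 32 x i1> %m, i32 %evl)
  %d = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide)
  %f1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %d, 1
  ret <vscale x 16 x i8> %f1
}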

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 13 additions & 7 deletions
@@ -17456,14 +17456,16 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 }

 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
-  unsigned Factor = DeinterleavedValues.size();
+    LoadInst *LI, ArrayRef<Value *> DeinterleavedValues,
+    unsigned Factor) const {
   if (Factor != 2 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
     return false;
   }

-  VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+  Value *FirstActive = *llvm::find_if(DeinterleavedValues,
+                                      [](Value *V) { return V != nullptr; });
+  VectorType *VTy = cast<VectorType>(FirstActive->getType());

   const DataLayout &DL = LI->getModule()->getDataLayout();
   bool UseScalable;
@@ -17512,8 +17514,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
       LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
     }
     // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
-    for (unsigned J = 0; J < Factor; ++J)
-      DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
+    for (unsigned J = 0; J < Factor; ++J) {
+      if (DeinterleavedValues[J])
+        DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
+    }
   } else {
     Value *Result;
     if (UseScalable)
@@ -17522,8 +17526,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
     // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
     for (unsigned I = 0; I < Factor; I++) {
-      Value *NewExtract = Builder.CreateExtractValue(Result, I);
-      DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
+      if (DeinterleavedValues[I]) {
+        Value *NewExtract = Builder.CreateExtractValue(Result, I);
+        DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
+      }
     }
   }
   return true;

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 2 deletions
@@ -218,8 +218,9 @@ class AArch64TargetLowering : public TargetLowering {
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;

-  bool lowerDeinterleaveIntrinsicToLoad(
-      LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+  bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+                                        ArrayRef<Value *> DeinterleaveValues,
+                                        unsigned Factor) const override;

   bool lowerInterleaveIntrinsicToStore(
       StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 11 additions & 10 deletions
@@ -24302,15 +24302,16 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
 }

 bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const {
-  unsigned Factor = DeinterleaveValues.size();
+    LoadInst *LI, ArrayRef<Value *> DeinterleaveValues, unsigned Factor) const {
   if (Factor > 8)
     return false;

   assert(LI->isSimple());
   IRBuilder<> Builder(LI);

-  auto *ResVTy = cast<VectorType>(DeinterleaveValues[0]->getType());
+  Value *FirstActive =
+      *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
+  VectorType *ResVTy = cast<VectorType>(FirstActive->getType());

   const DataLayout &DL = LI->getDataLayout();

@@ -24361,6 +24362,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
   }

   for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
+    if (!DIV)
+      continue;
     // We have to create a brand new ExtractValue to replace each
     // of these old ExtractValue instructions.
     Value *NewEV =
@@ -24484,17 +24487,15 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
 /// dealing with factor of 2 (extractvalue is still required for most of other
 /// factors though).
 bool RISCVTargetLowering::lowerInterleavedVPLoad(
-    VPIntrinsic *Load, Value *Mask,
-    ArrayRef<Value *> DeinterleaveResults) const {
+    VPIntrinsic *Load, Value *Mask, ArrayRef<Value *> DeinterleaveResults,
+    unsigned Factor) const {
   assert(Mask && "Expect a valid mask");
   assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
          "Unexpected intrinsic");

-  const unsigned Factor = DeinterleaveResults.size();
-
-  auto *VTy = dyn_cast<VectorType>(DeinterleaveResults[0]->getType());
-  if (!VTy)
-    return false;
+  Value *FirstActive = *llvm::find_if(DeinterleaveResults,
+                                      [](Value *V) { return V != nullptr; });
+  VectorType *VTy = cast<VectorType>(FirstActive->getType());

   auto &DL = Load->getModule()->getDataLayout();
   Align Alignment = Load->getParamAlign(0).value_or(

llvm/lib/Target/RISCV/RISCVISelLowering.h

Lines changed: 5 additions & 3 deletions
@@ -437,14 +437,16 @@ class RISCVTargetLowering : public TargetLowering {
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;

-  bool lowerDeinterleaveIntrinsicToLoad(
-      LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+  bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+                                        ArrayRef<Value *> DeinterleaveValues,
+                                        unsigned Factor) const override;

   bool lowerInterleaveIntrinsicToStore(
       StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;

   bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                              ArrayRef<Value *> DeinterleaveRes) const override;
+                              ArrayRef<Value *> DeinterleaveRes,
+                              unsigned Factor) const override;

   bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
                                ArrayRef<Value *> InterleaveOps) const override;

llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll

Lines changed: 4 additions & 51 deletions
@@ -52,9 +52,8 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_load_nxv16i
 define <vscale x 16 x i8> @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_nxv16i8_nxv32i8_oneactive:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vl4r.v v12, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v8, v12, 0
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
 ; CHECK-NEXT: ret
   %vec = load <vscale x 32 x i8>, ptr %p
   %deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
@@ -65,9 +64,8 @@ define <vscale x 16 x i8> @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive(pt
 define <vscale x 16 x i8> @vector_deinterleave_load_nxv16i8_nxv32i8_oneactive2(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_nxv16i8_nxv32i8_oneactive2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vl4r.v v12, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v8, v12, 8
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vlseg2e8.v v6, (a0)
 ; CHECK-NEXT: ret
   %vec = load <vscale x 32 x i8>, ptr %p
   %deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
@@ -409,23 +407,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
 define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: vl4r.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT: vlseg4e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
 ; CHECK-NEXT: ret
   %vec = load <vscale x 32 x i8>, ptr %p
   %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -436,23 +419,8 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
 define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: vl4r.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT: vlseg4e8.v v5, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
 ; CHECK-NEXT: ret
   %vec = load <vscale x 32 x i8>, ptr %p
   %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -463,23 +431,8 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
 define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deinterleave_load_factor4_twoactive(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor4_twoactive:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT: vl4r.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs4r.v v8, (a0)
 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT: vlseg4e8.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
 ; CHECK-NEXT: ret
   %vec = load <vscale x 32 x i8>, ptr %p
   %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
