Skip to content

Commit 24bb180

Browse files
authored
[RISCV] Attempt to widen SEW before generic shuffle lowering (#122311)
This takes inspiration from AArch64 which does the same thing to assist with zip/trn/etc.. Doing this recursion unconditionally when the mask allows is slightly questionable, but seems to work out okay in practice. As a bit of context, it's helpful to realize that we have existing logic in both DAGCombine and InstCombine which mutates the element width in an analogous manner. However, that code has two restrictions which prevent it from handling the motivating cases here. First, it only triggers if there is a bitcast involving a different element type. Second, the matcher used considers a partially undef wide element to be a non-match. I considered trying to relax those assumptions, but the information loss for undef in mid-level opt seemed more likely to open a can of worms than I wanted.
1 parent 9c85cde commit 24bb180

File tree

8 files changed

+388
-478
lines changed

8 files changed

+388
-478
lines changed

llvm/include/llvm/Analysis/VectorUtils.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,10 +235,17 @@ void narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
235235
bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
236236
SmallVectorImpl<int> &ScaledMask);
237237

238+
/// A variant of the previous method which is specialized for Scale=2, and
239+
/// treats -1 as undef and allows widening when a wider element is partially
240+
/// undef in the narrow form of the mask. This transformation discards
241+
/// information about which bytes in the original shuffle were undef.
242+
bool widenShuffleMaskElts(ArrayRef<int> M, SmallVectorImpl<int> &NewMask);
243+
238244
/// Attempt to narrow/widen the \p Mask shuffle mask to the \p NumDstElts target
239245
/// width. Internally this will call narrowShuffleMaskElts/widenShuffleMaskElts.
240-
/// This will assert unless NumDstElts is a multiple of Mask.size (or vice-versa).
241-
/// Returns false on failure, and ScaledMask will be in an undefined state.
246+
/// This will assert unless NumDstElts is a multiple of Mask.size (or
247+
/// vice-versa). Returns false on failure, and ScaledMask will be in an
248+
/// undefined state.
242249
bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
243250
SmallVectorImpl<int> &ScaledMask);
244251

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,41 @@ bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
479479
return true;
480480
}
481481

482+
/// Widen a shuffle mask by a factor of exactly two, treating -1 as undef.
/// Unlike the Scale-based variant above, a wide element whose narrow halves
/// are only partially undef still widens successfully; the resulting mask
/// discards the information about which original narrow elements were undef.
bool llvm::widenShuffleMaskElts(ArrayRef<int> M,
                                SmallVectorImpl<int> &NewMask) {
  unsigned NumElts = M.size();
  // An odd-length mask cannot be paired up into wide elements.
  if (NumElts % 2 != 0)
    return false;

  NewMask.clear();
  for (unsigned Idx = 0; Idx != NumElts; Idx += 2) {
    const int Lo = M[Idx];
    const int Hi = M[Idx + 1];

    int Wide;
    if (Lo == -1 && Hi == -1) {
      // Fully undef pair -> undef wide element.
      Wide = -1;
    } else if (Lo == -1 && Hi != -1 && (Hi % 2) == 1) {
      // Only the odd (high) half is defined; it pins the wide element Hi / 2.
      Wide = Hi / 2;
    } else if (Lo != -1 && (Lo % 2) == 0 && (Hi == Lo + 1 || Hi == -1)) {
      // Even-aligned pair (high half may be undef) -> wide element Lo / 2.
      Wide = Lo / 2;
    } else {
      // The pair straddles a wide-element boundary; widening is impossible.
      NewMask.clear();
      return false;
    }
    NewMask.push_back(Wide);
  }

  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
  return true;
}
516+
482517
bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
483518
SmallVectorImpl<int> &ScaledMask) {
484519
unsigned NumSrcElts = Mask.size();

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 1 addition & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13723,44 +13723,6 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
1372313723
return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
1372413724
}
1372513725

13726-
// Return true if we can get a new shuffle mask by checking the parameter mask
13727-
// array to test whether every two adjacent mask values are continuous and
13728-
// starting from an even number.
13729-
static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
13730-
SmallVectorImpl<int> &NewMask) {
13731-
unsigned NumElts = VT.getVectorNumElements();
13732-
if (NumElts % 2 != 0)
13733-
return false;
13734-
13735-
NewMask.clear();
13736-
for (unsigned i = 0; i < NumElts; i += 2) {
13737-
int M0 = M[i];
13738-
int M1 = M[i + 1];
13739-
13740-
// If both elements are undef, new mask is undef too.
13741-
if (M0 == -1 && M1 == -1) {
13742-
NewMask.push_back(-1);
13743-
continue;
13744-
}
13745-
13746-
if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
13747-
NewMask.push_back(M1 / 2);
13748-
continue;
13749-
}
13750-
13751-
if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
13752-
NewMask.push_back(M0 / 2);
13753-
continue;
13754-
}
13755-
13756-
NewMask.clear();
13757-
return false;
13758-
}
13759-
13760-
assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
13761-
return true;
13762-
}
13763-
1376413726
// Try to widen element type to get a new mask value for a better permutation
1376513727
// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
1376613728
// UZP1/2, TRN1/2, REV, INS, etc.
@@ -13787,7 +13749,7 @@ static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
1378713749
return SDValue();
1378813750

1378913751
SmallVector<int, 8> NewMask;
13790-
if (isWideTypeMask(Mask, VT, NewMask)) {
13752+
if (widenShuffleMaskElts(Mask, NewMask)) {
1379113753
MVT NewEltVT = VT.isFloatingPoint()
1379213754
? MVT::getFloatingPointVT(ElementSize * 2)
1379313755
: MVT::getIntegerVT(ElementSize * 2);

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5261,6 +5261,39 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
52615261
return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
52625262
}
52635263

5264+
/// Try to widen element type to get a new mask value for a better permutation
5265+
/// sequence. This doesn't try to inspect the widened mask for profitability;
5266+
/// we speculate the widened form is equal or better. This has the effect of
5267+
/// reducing mask constant sizes - allowing cheaper materialization sequences
5268+
/// - and index sequence sizes - reducing register pressure and materialization
5269+
/// cost, at the cost of (possibly) an extra VTYPE toggle.
5270+
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
5271+
SDLoc DL(Op);
5272+
MVT VT = Op.getSimpleValueType();
5273+
MVT ScalarVT = VT.getVectorElementType();
5274+
unsigned ElementSize = ScalarVT.getFixedSizeInBits();
5275+
SDValue V0 = Op.getOperand(0);
5276+
SDValue V1 = Op.getOperand(1);
5277+
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
5278+
5279+
// Avoid wasted work leading to isTypeLegal check failing below
5280+
if (ElementSize > 32)
5281+
return SDValue();
5282+
5283+
SmallVector<int, 8> NewMask;
5284+
if (!widenShuffleMaskElts(Mask, NewMask))
5285+
return SDValue();
5286+
5287+
MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(ElementSize * 2)
5288+
: MVT::getIntegerVT(ElementSize * 2);
5289+
MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
5290+
if (!DAG.getTargetLoweringInfo().isTypeLegal(NewVT))
5291+
return SDValue();
5292+
V0 = DAG.getBitcast(NewVT, V0);
5293+
V1 = DAG.getBitcast(NewVT, V1);
5294+
return DAG.getBitcast(VT, DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
5295+
}
5296+
52645297
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
52655298
const RISCVSubtarget &Subtarget) {
52665299
SDValue V1 = Op.getOperand(0);
@@ -5506,6 +5539,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
55065539
if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
55075540
return V;
55085541

5542+
// Before hitting generic lowering fallbacks, try to widen the mask
5543+
// to a wider SEW.
5544+
if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
5545+
return V;
5546+
55095547
// Can we generate a vcompress instead of a vrgather? These scale better
55105548
// at high LMUL, at the cost of not being able to fold a following select
55115549
// into them. The mask constants are also smaller than the index vector
@@ -5615,6 +5653,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
56155653
if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
56165654
return V;
56175655

5656+
// Before hitting generic lowering fallbacks, try to widen the mask
5657+
// to a wider SEW.
5658+
if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
5659+
return V;
5660+
56185661
// Try to pick a profitable operand order.
56195662
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
56205663
SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts);

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 17 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -603,10 +603,8 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) {
603603
define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
604604
; CHECK-LABEL: concat_4xi8_start_undef_at_start:
605605
; CHECK: # %bb.0:
606-
; CHECK-NEXT: li a0, -32
607-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
608-
; CHECK-NEXT: vmv.s.x v0, a0
609-
; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t
606+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
607+
; CHECK-NEXT: vslideup.vi v8, v9, 2
610608
; CHECK-NEXT: ret
611609
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
612610
ret <8 x i8> %res
@@ -704,8 +702,8 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
704702
; CHECK-LABEL: shuffle_v8i32_2:
705703
; CHECK: # %bb.0:
706704
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
707-
; CHECK-NEXT: vmv.v.i v0, -13
708-
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
705+
; CHECK-NEXT: vmv.v.i v0, 13
706+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
709707
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
710708
; CHECK-NEXT: ret
711709
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
@@ -756,9 +754,9 @@ define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) {
756754
define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) {
757755
; CHECK-LABEL: shuffle_compress_singlesrc_e32:
758756
; CHECK: # %bb.0:
759-
; CHECK-NEXT: li a0, 115
760-
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
761-
; CHECK-NEXT: vmv.s.x v12, a0
757+
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
758+
; CHECK-NEXT: vmv.v.i v12, 13
759+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
762760
; CHECK-NEXT: vcompress.vm v10, v8, v12
763761
; CHECK-NEXT: vmv.v.v v8, v10
764762
; CHECK-NEXT: ret
@@ -832,26 +830,16 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32_index2(<8 x i32> %v) {
832830
}
833831

834832
define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) {
835-
; RV32-LABEL: shuffle_spread3_singlesrc_e32:
836-
; RV32: # %bb.0:
837-
; RV32-NEXT: lui a0, %hi(.LCPI57_0)
838-
; RV32-NEXT: addi a0, a0, %lo(.LCPI57_0)
839-
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
840-
; RV32-NEXT: vle16.v v12, (a0)
841-
; RV32-NEXT: vrgatherei16.vv v10, v8, v12
842-
; RV32-NEXT: vmv.v.v v8, v10
843-
; RV32-NEXT: ret
844-
;
845-
; RV64-LABEL: shuffle_spread3_singlesrc_e32:
846-
; RV64: # %bb.0:
847-
; RV64-NEXT: lui a0, 32769
848-
; RV64-NEXT: slli a0, a0, 21
849-
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
850-
; RV64-NEXT: vmv.v.x v12, a0
851-
; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
852-
; RV64-NEXT: vrgatherei16.vv v10, v8, v12
853-
; RV64-NEXT: vmv.v.v v8, v10
854-
; RV64-NEXT: ret
833+
; CHECK-LABEL: shuffle_spread3_singlesrc_e32:
834+
; CHECK: # %bb.0:
835+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
836+
; CHECK-NEXT: vmv.v.i v10, 0
837+
; CHECK-NEXT: li a0, 1
838+
; CHECK-NEXT: vslide1down.vx v12, v10, a0
839+
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
840+
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
841+
; CHECK-NEXT: vmv.v.v v8, v10
842+
; CHECK-NEXT: ret
855843
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 2, i32 undef>
856844
ret <8 x i32> %out
857845
}

0 commit comments

Comments
 (0)