Skip to content

Commit e3bd1f2

Browse files
authored
[LoongArch] lower vector shuffle to zero or any extend (#129485)
1 parent 174110b commit e3bd1f2

File tree

8 files changed

+437
-486
lines changed

8 files changed

+437
-486
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,132 @@ fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
542542
return true;
543543
}
544544

545+
/// Compute whether each element of a shuffle is zeroable.
546+
///
547+
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
548+
static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
549+
SDValue V2, APInt &KnownUndef,
550+
APInt &KnownZero) {
551+
int Size = Mask.size();
552+
KnownUndef = KnownZero = APInt::getZero(Size);
553+
554+
V1 = peekThroughBitcasts(V1);
555+
V2 = peekThroughBitcasts(V2);
556+
557+
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
558+
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
559+
560+
int VectorSizeInBits = V1.getValueSizeInBits();
561+
int ScalarSizeInBits = VectorSizeInBits / Size;
562+
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
563+
564+
for (int i = 0; i < Size; ++i) {
565+
int M = Mask[i];
566+
if (M < 0) {
567+
KnownUndef.setBit(i);
568+
continue;
569+
}
570+
if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
571+
KnownZero.setBit(i);
572+
continue;
573+
}
574+
}
575+
}
576+
577+
/// Lower VECTOR_SHUFFLE as ZERO_EXTEND Or ANY_EXTEND (if possible).
578+
///
579+
/// For example:
580+
/// %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
581+
/// <4 x i32> <i32 0, i32 4, i32 1, i32 4>
582+
/// %3 = bitcast <4 x i32> %2 to <2 x i64>
583+
/// is lowered to:
584+
/// (VREPLI $v1, 0)
585+
/// (VILVL $v0, $v1, $v0)
586+
static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
587+
ArrayRef<int> Mask, MVT VT,
588+
SDValue V1, SDValue V2,
589+
SelectionDAG &DAG) {
590+
int Bits = VT.getSizeInBits();
591+
int EltBits = VT.getScalarSizeInBits();
592+
int NumElements = VT.getVectorNumElements();
593+
594+
APInt KnownUndef, KnownZero;
595+
computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
596+
APInt Zeroable = KnownUndef | KnownZero;
597+
if (Zeroable.isAllOnes())
598+
return DAG.getConstant(0, DL, VT);
599+
600+
// Define a helper function to check a particular ext-scale and lower to it if
601+
// valid.
602+
auto Lower = [&](int Scale) -> SDValue {
603+
SDValue InputV;
604+
bool AnyExt = true;
605+
int Offset = 0;
606+
for (int i = 0; i < NumElements; i++) {
607+
int M = Mask[i];
608+
if (M < 0)
609+
continue;
610+
if (i % Scale != 0) {
611+
// Each of the extended elements need to be zeroable.
612+
if (!Zeroable[i])
613+
return SDValue();
614+
615+
AnyExt = false;
616+
continue;
617+
}
618+
619+
// Each of the base elements needs to be consecutive indices into the
620+
// same input vector.
621+
SDValue V = M < NumElements ? V1 : V2;
622+
M = M % NumElements;
623+
if (!InputV) {
624+
InputV = V;
625+
Offset = M - (i / Scale);
626+
627+
// These offset can't be handled
628+
if (Offset % (NumElements / Scale))
629+
return SDValue();
630+
} else if (InputV != V)
631+
return SDValue();
632+
633+
if (M != (Offset + (i / Scale)))
634+
return SDValue(); // Non-consecutive strided elements.
635+
}
636+
637+
// If we fail to find an input, we have a zero-shuffle which should always
638+
// have already been handled.
639+
if (!InputV)
640+
return SDValue();
641+
642+
do {
643+
unsigned VilVLoHi = LoongArchISD::VILVL;
644+
if (Offset >= (NumElements / 2)) {
645+
VilVLoHi = LoongArchISD::VILVH;
646+
Offset -= (NumElements / 2);
647+
}
648+
649+
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
650+
SDValue Ext =
651+
AnyExt ? DAG.getFreeze(InputV) : DAG.getConstant(0, DL, InputVT);
652+
InputV = DAG.getBitcast(InputVT, InputV);
653+
InputV = DAG.getNode(VilVLoHi, DL, InputVT, Ext, InputV);
654+
Scale /= 2;
655+
EltBits *= 2;
656+
NumElements /= 2;
657+
} while (Scale > 1);
658+
return DAG.getBitcast(VT, InputV);
659+
};
660+
661+
// Each iteration, try extending the elements half as much, but into twice as
662+
// many elements.
663+
for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
664+
NumExtElements *= 2) {
665+
if (SDValue V = Lower(NumElements / NumExtElements))
666+
return V;
667+
}
668+
return SDValue();
669+
}
670+
545671
/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
546672
///
547673
/// VREPLVEI performs vector broadcast based on an element specified by an
@@ -956,6 +1082,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
9561082
return Result;
9571083
if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
9581084
return Result;
1085+
if ((Result =
1086+
lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG)))
1087+
return Result;
9591088
if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
9601089
return Result;
9611090

llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,14 @@ class LoongArchTargetLowering : public TargetLowering {
271271
unsigned *Fast = nullptr) const override;
272272

273273
/// Report whether a vector shuffle producing \p VT may be treated as legal.
///
/// The decision depends only on the type: \p Mask is not inspected. Extended
/// (non-simple) types and vectors of i1 are rejected; every other type is
/// accepted exactly when isTypeLegal() accepts it.
bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override {
  if (!VT.isSimple())
    return false;

  // Not for i1 vectors
  if (VT.getSimpleVT().getScalarType() == MVT::i1)
    return false;

  return isTypeLegal(VT.getSimpleVT());
}
276283
bool shouldConsiderGEPOffsetSplit() const override { return true; }
277284
bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const override;

llvm/test/CodeGen/LoongArch/lsx/build-vector.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -374,10 +374,11 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
374374
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
375375
; CHECK: # %bb.0:
376376
; CHECK-NEXT: vld $vr0, $a0, 0
377-
; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
378-
; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0
379-
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
380-
; CHECK-NEXT: vst $vr0, $a1, 0
377+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0)
378+
; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI24_0)
379+
; CHECK-NEXT: vrepli.b $vr2, 0
380+
; CHECK-NEXT: vshuf.w $vr1, $vr2, $vr0
381+
; CHECK-NEXT: vst $vr1, $a1, 0
381382
; CHECK-NEXT: ret
382383
%v = load volatile <4 x i32>, ptr %src
383384
%e = extractelement <4 x i32> %v, i32 1

0 commit comments

Comments
 (0)