Skip to content

[LoongArch] Optimize 128-to-256-bit vector insertion and 256-to-128-bit subvector extraction #146300

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 71 additions & 1 deletion llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, VT, Legal);
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
Expand Down Expand Up @@ -350,7 +351,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);

setOperationAction(ISD::SETCC, VT, Legal);
setOperationAction(ISD::VSELECT, VT, Legal);
Expand Down Expand Up @@ -497,6 +499,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS:
return lowerCONCAT_VECTORS(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return lowerVECTOR_SHUFFLE(Op, DAG);
case ISD::BITREVERSE:
Expand Down Expand Up @@ -2520,6 +2524,72 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
return SDValue();
}

/// Lower a 256-bit CONCAT_VECTORS of two subvectors into INSERT_SUBVECTOR
/// nodes.
///
/// Every operand is classified as undef, freeze(undef), all-zeros or "other".
/// The result starts from a base vector (zero when at least one operand has
/// to be zero, otherwise freeze(undef) or plain undef) and each "other"
/// operand is then inserted at its element offset.
///
/// \param Op  The CONCAT_VECTORS node; asserted to be a 256-bit vector with
///            exactly two operands.
/// \param DAG The SelectionDAG used to create the replacement nodes.
/// \returns The value that replaces \p Op.
SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT ResVT = Op.getSimpleValueType();
  assert(ResVT.is256BitVector() && Op.getNumOperands() == 2 &&
         "Expected a 256-bit concat of exactly two subvectors");

  // SelectionDAG::getConstant is only valid for integer types, so a zero
  // vector of floating-point type has to be created with getConstantFP.
  auto GetZeroVector = [&](MVT VT) {
    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, DL, VT)
                                : DAG.getConstant(0, DL, VT);
  };

  unsigned NumOperands = Op.getNumOperands();
  unsigned NumFreezeUndef = 0;
  unsigned NumZero = 0;
  unsigned NonZeros = 0; // Bit i is set when operand i has to be inserted.
  SmallSet<SDValue, 4> Undefs;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isFreezeUndef(SubVec.getNode())) {
      // If the freeze(undef) has multiple uses then we must fold to zero.
      if (SubVec.hasOneUse()) {
        ++NumFreezeUndef;
      } else {
        ++NumZero;
        Undefs.insert(SubVec);
      }
    } else if (ISD::isBuildVectorAllZeros(SubVec.getNode())) {
      ++NumZero;
    } else {
      assert(i < sizeof(NonZeros) * CHAR_BIT && "Shift amount out of range");
      NonZeros |= 1u << i;
    }
  }

  // With exactly two operands (asserted above) there can never be more than
  // two non-zero subvectors, so no "split into halves" path is needed; the
  // result is always built up through insert_subvectors.
  SDValue Vec = NumZero ? GetZeroVector(ResVT)
                        : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
                                          : DAG.getUNDEF(ResVT));

  // Replace multi-use freeze(undef) operands with a zero vector so that every
  // user observes the same value that was folded into the base vector.
  for (SDValue U : Undefs)
    DAG.ReplaceAllUsesWith(U, GetZeroVector(U.getSimpleValueType()));

  MVT SubVT = Op.getOperand(0).getSimpleValueType();
  unsigned NumSubElems = SubVT.getVectorNumElements();
  for (unsigned i = 0; i != NumOperands; ++i) {
    if ((NonZeros & (1u << i)) == 0)
      continue;

    // Operand i occupies elements [i * NumSubElems, (i + 1) * NumSubElems).
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResVT, Vec, Op.getOperand(i),
                      DAG.getVectorIdxConstant(i * NumSubElems, DL));
  }

  return Vec;
}

SDValue
LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/LoongArch/LoongArchISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ class LoongArchTargetLowering : public TargetLowering {
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITREVERSE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
Expand Down
35 changes: 29 additions & 6 deletions llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1860,12 +1860,6 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)),
(XVFTINTRZ_LU_D v4f64:$vj)),
sub_128)>;

// XVPERMI_Q
foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in
def : Pat<(vt (concat_vectors LSX128:$vd, LSX128:$vj)),
(XVPERMI_Q (SUBREG_TO_REG (i64 0), LSX128:$vd, sub_128),
(SUBREG_TO_REG (i64 0), LSX128:$vj, sub_128), 2)>;

// XVABSD_{B/H/W/D}[U]
defm : PatXrXr<abds, "XVABSD">;
defm : PatXrXrU<abdu, "XVABSD">;
Expand All @@ -1879,6 +1873,35 @@ def : Pat<(loongarch_xvmskgez (v32i8 LASX256:$vj)), (PseudoXVMSKGEZ_B LASX256:$v
def : Pat<(loongarch_xvmskeqz (v32i8 LASX256:$vj)), (PseudoXVMSKEQZ_B LASX256:$vj)>;
def : Pat<(loongarch_xvmsknez (v32i8 LASX256:$vj)), (PseudoXVMSKNEZ_B LASX256:$vj)>;

// Subvector tricks
// Patterns for insert_subvector/extract_subvector
// Selection patterns for extract_subvector / insert_subvector between a
// subvector type and the full vector type that contains it.
//   subRC / subVT - register class and value type of the subvector.
//   RC / VT       - register class and value type of the full vector.
//   hiIdx         - element index used for the non-zero subvector position.
//   subIdx        - subregister index used to access the subvector at index 0.
multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT,
RegisterClass RC, ValueType VT,
int hiIdx, SubRegIndex subIdx> {
// Extracting the subvector at element index 0 is a subregister copy that
// needs no instruction. Likewise, inserting a subvector at element index 0 of
// an undef (or freeze(undef)) vector is a subregister copy that needs no
// instruction.
def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
(subVT (EXTRACT_SUBREG RC:$src, subIdx))>;
def : Pat<(VT (insert_subvector undef_or_freeze_undef, subRC:$src, (iPTR 0))),
(VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;

// Extracting the subvector at hiIdx goes through XVPERMI_Q with immediate 1
// and then reads subIdx of the result.
// NOTE(review): presumably immediate 1 moves the upper half of $src into the
// lower half of the result - confirm against the xvpermi.q definition.
def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR hiIdx))),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xvpermi.q $xd, $xj 1 would change the $xj register if xj == xd

And many cases show this change.
e.g. tangaac/loong-opt-cov-ts@c0576ea#diff-364b3817b5e53b0b599b12041f8b1377a5d7c6f473fa5d3c75db971e939debb5L2105-R2165

We could use xvpermi.d $xd, $xj, 14 to avoid this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the output operand and an input operand are assigned the same physical register, does that mean the input operand's live range has ended?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

(subVT (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), RC:$src, 1), subIdx))>;
// Inserting into an arbitrary (not known undef) vector uses XVPERMI_Q on $vd
// and the subvector widened via INSERT_SUBREG: immediate 48 for index 0 and
// immediate 2 for index hiIdx.
// NOTE(review): presumably 48 keeps the upper half of $vd and 2 keeps its
// lower half - confirm against the xvpermi.q definition.
def : Pat<(VT (insert_subvector RC:$vd, subRC:$vj, (iPTR 0))),
(VT (XVPERMI_Q RC:$vd, (INSERT_SUBREG (IMPLICIT_DEF), subRC:$vj, subIdx), 48))>;
def : Pat<(VT (insert_subvector RC:$vd, subRC:$vj, (iPTR hiIdx))),
(VT (XVPERMI_Q RC:$vd, (INSERT_SUBREG (IMPLICIT_DEF), subRC:$vj, subIdx), 2))>;
}

// Instantiate the subvector patterns for each LSX128 / LASX256 type pair.
// In every instantiation hiIdx equals the element count of the subvector
// type, i.e. the element index at which the second subvector starts.
defm : subvector_subreg_lowering<LSX128, v4i32, LASX256, v8i32, 4, sub_128>;
defm : subvector_subreg_lowering<LSX128, v4f32, LASX256, v8f32, 4, sub_128>;
defm : subvector_subreg_lowering<LSX128, v2i64, LASX256, v4i64, 2, sub_128>;
defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>;
defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>;
defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>;

} // Predicates = [HasExtLASX]

/// Intrinsic pattern
Expand Down
218 changes: 218 additions & 0 deletions llvm/test/CodeGen/LoongArch/lasx/concat-vectors.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s

; Widen %a to <32 x i8>: the low 16 lanes come from %a, the high 16 lanes from
; the poison operand. The expected output contains no vector instruction, only
; the sub-register kill annotation.
define <32 x i8> @concat_poison_v32i8_1(<16 x i8> %a) {
; CHECK-LABEL: concat_poison_v32i8_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <16 x i8> %a, <16 x i8> poison,
<32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %1
}

; Widen the second argument %b to <32 x i8> with a poison high half (%a is
; unused). The only instruction expected is a vori.b with immediate 0 that
; copies $vr1 into $vr0.
define <32 x i8> @concat_poison_v32i8_2(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: concat_poison_v32i8_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <16 x i8> %b, <16 x i8> poison,
<32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %1
}

; Concatenate %a (low 16 lanes) and %b (high 16 lanes) into <32 x i8>. The
; expected lowering is a single xvpermi.q with immediate 2.
define <32 x i8> @concat_vectors_v32i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: concat_vectors_v32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
; CHECK-NEXT: ret
entry:
%1 = shufflevector <16 x i8> %a, <16 x i8> %b,
<32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %1
}

; Widen %a to <16 x i16>: the low 8 lanes come from %a, the high 8 lanes from
; the poison operand. The expected output contains no vector instruction, only
; the sub-register kill annotation.
define <16 x i16> @concat_poison_v16i16_1(<8 x i16> %a) {
; CHECK-LABEL: concat_poison_v16i16_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <8 x i16> %a, <8 x i16> poison,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %1
}

; Widen the second argument %b to <16 x i16> with a poison high half (%a is
; unused). The only instruction expected is a vori.b with immediate 0 that
; copies $vr1 into $vr0.
define <16 x i16> @concat_poison_v16i16_2(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: concat_poison_v16i16_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <8 x i16> %b, <8 x i16> poison,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %1
}

; Concatenate %a (low 8 lanes) and %b (high 8 lanes) into <16 x i16>. The
; expected lowering is a single xvpermi.q with immediate 2.
define <16 x i16> @concat_vectors_v16i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: concat_vectors_v16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
; CHECK-NEXT: ret
entry:
%1 = shufflevector <8 x i16> %a, <8 x i16> %b,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %1
}

; Widen %a to <8 x i32>: the low 4 lanes come from %a, the high 4 lanes from
; the poison operand. The expected output contains no vector instruction, only
; the sub-register kill annotation.
define <8 x i32> @concat_poison_v8i32_1(<4 x i32> %a) {
; CHECK-LABEL: concat_poison_v8i32_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <4 x i32> %a, <4 x i32> poison,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %1
}

; Widen the second argument %b to <8 x i32> with a poison high half (%a is
; unused). The only instruction expected is a vori.b with immediate 0 that
; copies $vr1 into $vr0.
define <8 x i32> @concat_poison_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: concat_poison_v8i32_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <4 x i32> %b, <4 x i32> poison,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %1
}

; Concatenate %a (low 4 lanes) and %b (high 4 lanes) into <8 x i32>. The
; expected lowering is a single xvpermi.q with immediate 2.
define <8 x i32> @concat_vectors_v8i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: concat_vectors_v8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
; CHECK-NEXT: ret
entry:
%1 = shufflevector <4 x i32> %a, <4 x i32> %b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %1
}

; Widen %a to <8 x float>: the low 4 lanes come from %a, the high 4 lanes from
; the poison operand. The expected output contains no vector instruction, only
; the sub-register kill annotation.
define <8 x float> @concat_poison_v8f32_1(<4 x float> %a) {
; CHECK-LABEL: concat_poison_v8f32_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <4 x float> %a, <4 x float> poison,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %1
}

; Widen the second argument %b to <8 x float> with a poison high half (%a is
; unused). The only instruction expected is a vori.b with immediate 0 that
; copies $vr1 into $vr0.
define <8 x float> @concat_poison_v8f32_2(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: concat_poison_v8f32_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <4 x float> %b, <4 x float> poison,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %1
}

; Concatenate %a (low 4 lanes) and %b (high 4 lanes) into <8 x float>. The
; expected lowering is a single xvpermi.q with immediate 2.
define <8 x float> @concat_vectors_v8f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: concat_vectors_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
; CHECK-NEXT: ret
entry:
%1 = shufflevector <4 x float> %a, <4 x float> %b,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %1
}

; Widen %a to <4 x i64>: the low 2 lanes come from %a, the high 2 lanes from
; the poison operand. The expected output contains no vector instruction, only
; the sub-register kill annotation.
; NOTE(review): the function name says v8i64 but the vector type is <4 x i64>.
define <4 x i64> @concat_poison_v8i64_1(<2 x i64> %a) {
; CHECK-LABEL: concat_poison_v8i64_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <2 x i64> %a, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %1
}

; Widen the second argument %b to <4 x i64> with a poison high half (%a is
; unused). The only instruction expected is a vori.b with immediate 0 that
; copies $vr1 into $vr0.
; NOTE(review): the function name says v8i64 but the vector type is <4 x i64>.
define <4 x i64> @concat_poison_v8i64_2(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: concat_poison_v8i64_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <2 x i64> %b, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %1
}

; Concatenate %a (low 2 lanes) and %b (high 2 lanes) into <4 x i64>. The
; expected lowering is a single xvpermi.q with immediate 2.
; NOTE(review): the function name says v8i64 but the vector type is <4 x i64>.
define <4 x i64> @concat_vectors_v8i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: concat_vectors_v8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
; CHECK-NEXT: ret
entry:
%1 = shufflevector <2 x i64> %a, <2 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %1
}

; Widen %a to <4 x double>: the low 2 lanes come from %a, the high 2 lanes
; from the poison operand. The expected output contains no vector instruction,
; only the sub-register kill annotation.
; NOTE(review): the function name says v8f64 but the vector type is
; <4 x double>.
define <4 x double> @concat_poison_v8f64_1(<2 x double> %a) {
; CHECK-LABEL: concat_poison_v8f64_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <2 x double> %a, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %1
}

; Widen the second argument %b to <4 x double> with a poison high half (%a is
; unused). The only instruction expected is a vori.b with immediate 0 that
; copies $vr1 into $vr0.
; NOTE(review): the function name says v8f64 but the vector type is
; <4 x double>.
define <4 x double> @concat_poison_v8f64_2(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: concat_poison_v8f64_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ret
entry:
%1 = shufflevector <2 x double> %b, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %1
}

; Concatenate %a (low 2 lanes) and %b (high 2 lanes) into <4 x double>. The
; expected lowering is a single xvpermi.q with immediate 2.
; NOTE(review): the function name says v8f64 but the vector type is
; <4 x double>.
define <4 x double> @concat_vectors_v8f64(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: concat_vectors_v8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
; CHECK-NEXT: ret
entry:
%1 = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %1
}
Loading
Loading