[LoongArch] Optimize 128-to-256-bit vector insertion and 256-to-128-bit subvector extraction #146300
@llvm/pr-subscribers-backend-loongarch

Author: hev (heiher)

Changes

This patch replaces stack-based accesses with register moves when converting between 128-bit and 256-bit vectors. A 128-bit subvector extract from, or insert to, the lower half of a 256-bit vector is now treated as a subregister copy that needs no instruction.

Patch is 41.49 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/146300.diff

6 Files Affected:
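In effect (a minimal sketch mirroring the autogenerated tests added by this patch; the experimental intrinsic naming follows the test files below, and the expected codegen comments are my reading of the new patterns, not verbatim checker output):

declare <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32>, i64)

define <4 x i32> @extract_lo128_v4i32(<8 x i32> %a) {
  ; Lower half: $vr0 aliases the low 128 bits of $xr0, so no instruction
  ; is emitted -- only a register-class annotation.
  %r = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %a, i64 0)
  ret <4 x i32> %r
}

define <4 x i32> @extract_hi128_v4i32(<8 x i32> %a) {
  ; Upper half: expected to lower to a single xvpermi.q moving the high
  ; 128 bits into the low half, instead of a stack store plus reload.
  %r = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %a, i64 4)
  ret <4 x i32> %r
}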
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 7dae4d30d31be..addb0e056ff9c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -289,6 +289,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, VT, Legal);
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
@@ -350,7 +351,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SETCC, VT, Legal);
setOperationAction(ISD::VSELECT, VT, Legal);
@@ -497,6 +499,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
+ case ISD::CONCAT_VECTORS:
+ return lowerCONCAT_VECTORS(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return lowerVECTOR_SHUFFLE(Op, DAG);
case ISD::BITREVERSE:
@@ -2520,6 +2524,72 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
return SDValue();
}
+SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT ResVT = Op.getSimpleValueType();
+ assert(ResVT.is256BitVector() && Op.getNumOperands() == 2);
+
+ unsigned NumOperands = Op.getNumOperands();
+ unsigned NumFreezeUndef = 0;
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ SmallSet<SDValue, 4> Undefs;
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ SDValue SubVec = Op.getOperand(i);
+ if (SubVec.isUndef())
+ continue;
+ if (ISD::isFreezeUndef(SubVec.getNode())) {
+ // If the freeze(undef) has multiple uses then we must fold to zero.
+ if (SubVec.hasOneUse()) {
+ ++NumFreezeUndef;
+ } else {
+ ++NumZero;
+ Undefs.insert(SubVec);
+ }
+ } else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ ++NumZero;
+ else {
+ assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+ NonZeros |= 1 << i;
+ ++NumNonZero;
+ }
+ }
+
+ // If we have more than 2 non-zeros, build each half separately.
+ if (NumNonZero > 2) {
+ MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
+ ArrayRef<SDUse> Ops = Op->ops();
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+ Ops.slice(0, NumOperands / 2));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+ Ops.slice(NumOperands / 2));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+ }
+
+ // Otherwise, build it up through insert_subvectors.
+ SDValue Vec = NumZero ? DAG.getConstant(0, DL, ResVT)
+ : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
+ : DAG.getUNDEF(ResVT));
+
+ // Replace Undef operands with ZeroVector.
+ for (SDValue U : Undefs)
+ DAG.ReplaceAllUsesWith(U, DAG.getConstant(0, DL, U.getSimpleValueType()));
+
+ MVT SubVT = Op.getOperand(0).getSimpleValueType();
+ unsigned NumSubElems = SubVT.getVectorNumElements();
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ if ((NonZeros & (1 << i)) == 0)
+ continue;
+
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResVT, Vec, Op.getOperand(i),
+ DAG.getVectorIdxConstant(i * NumSubElems, DL));
+ }
+
+ return Vec;
+}
+
SDValue
LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 60dc2b385a75c..6b49a98f3ae46 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -376,6 +376,7 @@ class LoongArchTargetLowering : public TargetLowering {
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITREVERSE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index ff7b0f2ae3f25..95e9fd49d1c0d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1860,12 +1860,6 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)),
(XVFTINTRZ_LU_D v4f64:$vj)),
sub_128)>;
-// XVPERMI_Q
-foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in
-def : Pat<(vt (concat_vectors LSX128:$vd, LSX128:$vj)),
- (XVPERMI_Q (SUBREG_TO_REG (i64 0), LSX128:$vd, sub_128),
- (SUBREG_TO_REG (i64 0), LSX128:$vj, sub_128), 2)>;
-
// XVABSD_{B/H/W/D}[U]
defm : PatXrXr<abds, "XVABSD">;
defm : PatXrXrU<abdu, "XVABSD">;
@@ -1879,6 +1873,35 @@ def : Pat<(loongarch_xvmskgez (v32i8 LASX256:$vj)), (PseudoXVMSKGEZ_B LASX256:$v
def : Pat<(loongarch_xvmskeqz (v32i8 LASX256:$vj)), (PseudoXVMSKEQZ_B LASX256:$vj)>;
def : Pat<(loongarch_xvmsknez (v32i8 LASX256:$vj)), (PseudoXVMSKNEZ_B LASX256:$vj)>;
+// Subvector tricks
+// Patterns for insert_subvector/extract_subvector
+multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT,
+ RegisterClass RC, ValueType VT,
+ int hiIdx, SubRegIndex subIdx> {
+ // A 128-bit subvector extract from the first 256-bit vector position is a
+ // subregister copy that needs no instruction. Likewise, a 128-bit subvector
+ // insert to the first 256-bit vector position is a subregister copy that needs
+ // no instruction.
+ def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
+ (subVT (EXTRACT_SUBREG RC:$src, subIdx))>;
+ def : Pat<(VT (insert_subvector undef_or_freeze_undef, subRC:$src, (iPTR 0))),
+ (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;
+
+ def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR hiIdx))),
+ (subVT (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), RC:$src, 1), subIdx))>;
+ def : Pat<(VT (insert_subvector RC:$vd, subRC:$vj, (iPTR 0))),
+ (VT (XVPERMI_Q RC:$vd, (INSERT_SUBREG (IMPLICIT_DEF), subRC:$vj, subIdx), 48))>;
+ def : Pat<(VT (insert_subvector RC:$vd, subRC:$vj, (iPTR hiIdx))),
+ (VT (XVPERMI_Q RC:$vd, (INSERT_SUBREG (IMPLICIT_DEF), subRC:$vj, subIdx), 2))>;
+}
+
+defm : subvector_subreg_lowering<LSX128, v4i32, LASX256, v8i32, 4, sub_128>;
+defm : subvector_subreg_lowering<LSX128, v4f32, LASX256, v8f32, 4, sub_128>;
+defm : subvector_subreg_lowering<LSX128, v2i64, LASX256, v4i64, 2, sub_128>;
+defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>;
+defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>;
+defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>;
+
} // Predicates = [HasExtLASX]
/// Intrinsic pattern
diff --git a/llvm/test/CodeGen/LoongArch/lasx/concat-vectors.ll b/llvm/test/CodeGen/LoongArch/lasx/concat-vectors.ll
new file mode 100644
index 0000000000000..231e82a6d53ac
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/concat-vectors.ll
@@ -0,0 +1,218 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+define <32 x i8> @concat_poison_v32i8_1(<16 x i8> %a) {
+; CHECK-LABEL: concat_poison_v32i8_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <16 x i8> %a, <16 x i8> poison,
+ <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %1
+}
+
+define <32 x i8> @concat_poison_v32i8_2(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: concat_poison_v32i8_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <16 x i8> %b, <16 x i8> poison,
+ <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %1
+}
+
+define <32 x i8> @concat_vectors_v32i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: concat_vectors_v32i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <16 x i8> %a, <16 x i8> %b,
+ <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %1
+}
+
+define <16 x i16> @concat_poison_v16i16_1(<8 x i16> %a) {
+; CHECK-LABEL: concat_poison_v16i16_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <8 x i16> %a, <8 x i16> poison,
+ <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %1
+}
+
+define <16 x i16> @concat_poison_v16i16_2(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: concat_poison_v16i16_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <8 x i16> %b, <8 x i16> poison,
+ <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %1
+}
+
+define <16 x i16> @concat_vectors_v16i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: concat_vectors_v16i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %1
+}
+
+define <8 x i32> @concat_poison_v8i32_1(<4 x i32> %a) {
+; CHECK-LABEL: concat_poison_v8i32_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <4 x i32> %a, <4 x i32> poison,
+ <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @concat_poison_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: concat_poison_v8i32_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <4 x i32> %b, <4 x i32> poison,
+ <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @concat_vectors_v8i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: concat_vectors_v8i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b,
+ <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %1
+}
+
+define <8 x float> @concat_poison_v8f32_1(<4 x float> %a) {
+; CHECK-LABEL: concat_poison_v8f32_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <4 x float> %a, <4 x float> poison,
+ <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %1
+}
+
+define <8 x float> @concat_poison_v8f32_2(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: concat_poison_v8f32_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <4 x float> %b, <4 x float> poison,
+ <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %1
+}
+
+define <8 x float> @concat_vectors_v8f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: concat_vectors_v8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <4 x float> %a, <4 x float> %b,
+ <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %1
+}
+
+define <4 x i64> @concat_poison_v8i64_1(<2 x i64> %a) {
+; CHECK-LABEL: concat_poison_v8i64_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @concat_poison_v8i64_2(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: concat_poison_v8i64_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <2 x i64> %b, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @concat_vectors_v8i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: concat_vectors_v8i64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <2 x i64> %a, <2 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %1
+}
+
+define <4 x double> @concat_poison_v8f64_1(<2 x double> %a) {
+; CHECK-LABEL: concat_poison_v8f64_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <2 x double> %a, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %1
+}
+
+define <4 x double> @concat_poison_v8f64_2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: concat_poison_v8f64_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <2 x double> %b, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %1
+}
+
+define <4 x double> @concat_vectors_v8f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: concat_vectors_v8f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %1
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/insert-extract-subvector.ll b/llvm/test/CodeGen/LoongArch/lasx/insert-extract-subvector.ll
new file mode 100644
index 0000000000000..7a90afca376db
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/insert-extract-subvector.ll
@@ -0,0 +1,668 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64)
+
+define <8 x i32> @insert_lo128_v8i32_1(<4 x i32> %a) {
+; CHECK-LABEL: insert_lo128_v8i32_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: ret
+entry:
+ %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> %a, i64 0)
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @insert_hi128_v8i32_1(<4 x i32> %a) {
+; CHECK-LABEL: insert_hi128_v8i32_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> %a, i64 4)
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @insert_lo128_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: insert_lo128_v8i32_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
+; CHECK-NEXT: ret
+entry:
+ %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> %b, i64 0)
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @insert_hi128_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: insert_hi128_v8i32_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> %b, i64 4)
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @insert_lo128_v8i32_3(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: insert_lo128_v8i32_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48
+; CHECK-NEXT: ret
+entry:
+ %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %a, <4 x i32> %b, i64 0)
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @insert_hi128_v8i32_3(<8 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: insert_hi128_v8i32_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: ret
+entry:
+ %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %a, <4 x i32> %b, i64 4)
+ ret <8 x i32> %1
+}
+
+declare <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x float>, <4 x float>, i64)
+
+define <8 x float> @insert_lo128_v8f32_1(<4 x float> %a) {
+; CHECK-LABEL: insert_lo128_v8f32_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT: ret
+entry:
+ %1 = call <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> %a, i64 0)
+ ret <8 x float> %1
+}
+
+define <8 x float> @insert_hi128_v8f32_1(<4 x float> %a) {
+; CHECK-LABEL: insert_hi128_v8f32_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
+; CHECK-NEXT:...
[truncated]
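As background for the immediates used in the patterns above (my reading of the LASX xvpermi.q encoding, stated as an assumption rather than spec text): bits [1:0] of the immediate select the 128-bit lane written to the destination's low half and bits [5:4] the lane written to its high half, choosing among xj-low (0), xj-high (1), xd-low (2), and xd-high (3). Hence:

xvpermi.q $xd, $xj, 1    # xd.lo = xj.hi (extract high 128 bits; xd.hi gets
                         # xj.lo, which the pattern then discards)
xvpermi.q $xd, $xj, 2    # xd.hi = xj.lo, xd.lo kept (insert into high half)
xvpermi.q $xd, $xj, 48   # xd.lo = xj.lo, xd.hi kept (insert into low half)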
def : Pat<(VT (insert_subvector undef_or_freeze_undef, subRC:$src, (iPTR 0))),
          (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;

def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR hiIdx))),
xvpermi.q $xd, $xj, 1 would change the $xj register when xj == xd, and many test cases show this change, e.g. tangaac/loong-opt-cov-ts@c0576ea#diff-364b3817b5e53b0b599b12041f8b1377a5d7c6f473fa5d3c75db971e939debb5L2105-R2165. We could use xvpermi.d $xd, $xj, 14 to avoid this.
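A sketch of the concern, under a hypothetical allocation where the IMPLICIT_DEF destination and $xj share $xr0 (register choices are illustrative):

xvpermi.q $xr0, $xr0, 1    # writes $xr0, destroying the $xj value it also
                           # reads; safe only if $xj's live range ends here
xvpermi.d $xr1, $xr0, 14   # alternative: $xr1.d0 = $xr0.d2, $xr1.d1 = $xr0.d3,
                           # i.e. the high 128 bits move to the low half using
                           # only the source operand, and $xr0 is preserved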
If the output operand and an input operand are assigned the same physical register, does that mean the input operand's live range has ended?
Yes
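Illustratively (a hand-written MIR sketch, not output from this patch): the allocator assigns the definition and a non-tied input to the same physical register only when the input is killed at that instruction, so the overwrite cannot corrupt a live value.

$xr0 = XVPERMI_Q undef $xr0, killed $xr0, 1
; 'killed' marks the input's last use; without it, register allocation
; would have chosen a different physical register for the definition.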
LGTM