Skip to content

Commit eb0d61a

Browse files
authored
[LoongArch] Optimize 128-to-256-bit vector insertion and 256-to-128-bit subvector extraction (#146300)
This patch replaces stack-based accesses with register moves when converting between 128-bit and 256-bit vectors. A 128-bit subvector extract from, or insert to, the lower half of a 256-bit vector is now treated as a subregister copy that needs no instruction. Fixes #147769
1 parent a510e75 commit eb0d61a

File tree

6 files changed

+205
-933
lines changed

6 files changed

+205
-933
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
291291
setOperationAction(ISD::SETCC, VT, Legal);
292292
setOperationAction(ISD::VSELECT, VT, Legal);
293293
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
294+
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
294295
}
295296
for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
296297
setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
@@ -352,7 +353,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
352353
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
353354
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
354355
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
355-
setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
356+
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
357+
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
356358

357359
setOperationAction(ISD::SETCC, VT, Legal);
358360
setOperationAction(ISD::VSELECT, VT, Legal);
@@ -499,6 +501,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
499501
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
500502
case ISD::BUILD_VECTOR:
501503
return lowerBUILD_VECTOR(Op, DAG);
504+
case ISD::CONCAT_VECTORS:
505+
return lowerCONCAT_VECTORS(Op, DAG);
502506
case ISD::VECTOR_SHUFFLE:
503507
return lowerVECTOR_SHUFFLE(Op, DAG);
504508
case ISD::BITREVERSE:
@@ -2522,6 +2526,72 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
25222526
return SDValue();
25232527
}
25242528

2529+
/// Custom-lower a two-operand CONCAT_VECTORS that produces a 256-bit vector.
///
/// Every operand is classified as undef, freeze(undef), all-zeros, or
/// "non-zero" (anything else). The result is then built from a base vector
/// (all-zeros, frozen undef, or plain undef) into which each non-zero operand
/// is placed with INSERT_SUBVECTOR at element offset `i * NumSubElems`.
///
/// \param Op  The CONCAT_VECTORS node being lowered.
/// \param DAG The selection DAG used to create the replacement nodes.
/// \returns The base vector with all non-zero operands inserted.
SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT ResVT = Op.getSimpleValueType();
  assert(ResVT.is256BitVector() && Op.getNumOperands() == 2 &&
         "Expected a two-operand concat producing a 256-bit vector");

  // SelectionDAG::getConstant() only accepts integer types, so an all-zeros
  // floating-point vector has to be created through getConstantFP().
  auto GetZeroVector = [&DAG, &DL](MVT VT) -> SDValue {
    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, DL, VT)
                                : DAG.getConstant(0, DL, VT);
  };

  unsigned NumOperands = Op.getNumOperands();
  unsigned NumFreezeUndef = 0;
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  // Bit i is set when operand i is neither undef, freeze(undef) nor zero.
  unsigned NonZeros = 0;
  // freeze(undef) operands with several uses; they are rewritten to zero.
  SmallSet<SDValue, 4> Undefs;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isFreezeUndef(SubVec.getNode())) {
      // If the freeze(undef) has multiple uses then we must fold to zero.
      if (SubVec.hasOneUse()) {
        ++NumFreezeUndef;
      } else {
        ++NumZero;
        Undefs.insert(SubVec);
      }
    } else if (ISD::isBuildVectorAllZeros(SubVec.getNode())) {
      ++NumZero;
    } else {
      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
      NonZeros |= 1u << i;
      ++NumNonZero;
    }
  }

  // If we have more than 2 non-zeros, build each half separately.
  // With the two operands asserted above this cannot trigger; it is kept as a
  // guard in case the operand-count restriction is ever relaxed.
  if (NumNonZero > 2) {
    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
                             Ops.slice(0, NumOperands / 2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
                             Ops.slice(NumOperands / 2));
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
  }

  // Otherwise, build it up through insert_subvectors. A zero operand forces a
  // zero base; otherwise prefer frozen undef over plain undef when any
  // single-use freeze(undef) operand was seen.
  SDValue Vec;
  if (NumZero)
    Vec = GetZeroVector(ResVT);
  else if (NumFreezeUndef)
    Vec = DAG.getFreeze(DAG.getUNDEF(ResVT));
  else
    Vec = DAG.getUNDEF(ResVT);

  // Replace the multi-use freeze(undef) operands with a zero vector so every
  // user observes the same value that was folded into the base vector.
  for (SDValue U : Undefs)
    DAG.ReplaceAllUsesWith(U, GetZeroVector(U.getSimpleValueType()));

  MVT SubVT = Op.getOperand(0).getSimpleValueType();
  unsigned NumSubElems = SubVT.getVectorNumElements();
  for (unsigned i = 0; i != NumOperands; ++i) {
    if ((NonZeros & (1u << i)) == 0)
      continue;

    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResVT, Vec, Op.getOperand(i),
                      DAG.getVectorIdxConstant(i * NumSubElems, DL));
  }

  return Vec;
}
2594+
25252595
SDValue
25262596
LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
25272597
SelectionDAG &DAG) const {

llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,7 @@ class LoongArchTargetLowering : public TargetLowering {
376376
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
377377
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
378378
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
379+
SDValue lowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
379380
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
380381
SDValue lowerBITREVERSE(SDValue Op, SelectionDAG &DAG) const;
381382
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1860,12 +1860,6 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)),
18601860
(XVFTINTRZ_LU_D v4f64:$vj)),
18611861
sub_128)>;
18621862

1863-
// XVPERMI_Q
1864-
foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in
1865-
def : Pat<(vt (concat_vectors LSX128:$vd, LSX128:$vj)),
1866-
(XVPERMI_Q (SUBREG_TO_REG (i64 0), LSX128:$vd, sub_128),
1867-
(SUBREG_TO_REG (i64 0), LSX128:$vj, sub_128), 2)>;
1868-
18691863
// XVABSD_{B/H/W/D}[U]
18701864
defm : PatXrXr<abds, "XVABSD">;
18711865
defm : PatXrXrU<abdu, "XVABSD">;
@@ -1879,6 +1873,35 @@ def : Pat<(loongarch_xvmskgez (v32i8 LASX256:$vj)), (PseudoXVMSKGEZ_B LASX256:$v
18791873
def : Pat<(loongarch_xvmskeqz (v32i8 LASX256:$vj)), (PseudoXVMSKEQZ_B LASX256:$vj)>;
18801874
def : Pat<(loongarch_xvmsknez (v32i8 LASX256:$vj)), (PseudoXVMSKNEZ_B LASX256:$vj)>;
18811875

1876+
// Subvector tricks
1877+
// Patterns for insert_subvector/extract_subvector
1878+
// Selection patterns for extract_subvector / insert_subvector between a wide
// vector type and its subvector type.
//   subRC, subVT - register class and value type of the subvector.
//   RC, VT       - register class and value type of the wide vector.
//   hiIdx        - element index of the non-zero subvector position handled
//                  by the XVPERMI_Q patterns below.
//   subIdx       - subregister index used to access the subvector inside RC.
multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT,
1879+
RegisterClass RC, ValueType VT,
1880+
int hiIdx, SubRegIndex subIdx> {
1881+
// Extracting the subvector at element index 0 of the wide vector is a
1882+
// subregister copy that needs no instruction. Likewise, inserting a subvector
1883+
// at element index 0 of an undef (or frozen undef) wide vector is a
1884+
// subregister copy that needs no instruction.
1885+
def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
1886+
(subVT (EXTRACT_SUBREG RC:$src, subIdx))>;
1887+
def : Pat<(VT (insert_subvector undef_or_freeze_undef, subRC:$src, (iPTR 0))),
1888+
(VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;
1889+
1890+
// Extract at hiIdx: permute with XVPERMI_Q (immediate 1) and then read the
// subIdx subregister of the permuted value.
// NOTE(review): immediate 1 presumably selects the upper 128 bits of $src into
// the low half of the result - confirm against the LASX manual.
def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR hiIdx))),
1891+
(subVT (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), RC:$src, 1), subIdx))>;
1892+
// Insert at index 0 into an existing wide vector: widen $vj via INSERT_SUBREG
// and merge it with $vd using XVPERMI_Q, immediate 48.
// NOTE(review): 48 presumably keeps the upper half of $vd - confirm.
def : Pat<(VT (insert_subvector RC:$vd, subRC:$vj, (iPTR 0))),
1893+
(VT (XVPERMI_Q RC:$vd, (INSERT_SUBREG (IMPLICIT_DEF), subRC:$vj, subIdx), 48))>;
1894+
// Insert at hiIdx into an existing wide vector: same shape as above, but with
// XVPERMI_Q immediate 2.
// NOTE(review): 2 presumably keeps the lower half of $vd - confirm.
def : Pat<(VT (insert_subvector RC:$vd, subRC:$vj, (iPTR hiIdx))),
1895+
(VT (XVPERMI_Q RC:$vd, (INSERT_SUBREG (IMPLICIT_DEF), subRC:$vj, subIdx), 2))>;
1896+
}
1897+
1898+
// One instantiation per LSX128/LASX256 value-type pair; each passes half of the
// 256-bit type's element count as hiIdx and sub_128 as the subregister index.
defm : subvector_subreg_lowering<LSX128, v4i32, LASX256, v8i32, 4, sub_128>;
1899+
defm : subvector_subreg_lowering<LSX128, v4f32, LASX256, v8f32, 4, sub_128>;
1900+
defm : subvector_subreg_lowering<LSX128, v2i64, LASX256, v4i64, 2, sub_128>;
1901+
defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>;
1902+
defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>;
1903+
defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>;
1904+
18821905
} // Predicates = [HasExtLASX]
18831906

18841907
/// Intrinsic pattern

llvm/test/CodeGen/LoongArch/lasx/concat-vectors.ll

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ define <32 x i8> @concat_poison_v32i8_1(<16 x i8> %a) {
55
; CHECK-LABEL: concat_poison_v32i8_1:
66
; CHECK: # %bb.0: # %entry
77
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
8-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
98
; CHECK-NEXT: ret
109
entry:
1110
%1 = shufflevector <16 x i8> %a, <16 x i8> poison,
@@ -20,7 +19,6 @@ define <32 x i8> @concat_poison_v32i8_2(<16 x i8> %a, <16 x i8> %b) {
2019
; CHECK-LABEL: concat_poison_v32i8_2:
2120
; CHECK: # %bb.0: # %entry
2221
; CHECK-NEXT: vori.b $vr0, $vr1, 0
23-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
2422
; CHECK-NEXT: ret
2523
entry:
2624
%1 = shufflevector <16 x i8> %b, <16 x i8> poison,
@@ -51,7 +49,6 @@ define <16 x i16> @concat_poison_v16i16_1(<8 x i16> %a) {
5149
; CHECK-LABEL: concat_poison_v16i16_1:
5250
; CHECK: # %bb.0: # %entry
5351
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
54-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
5552
; CHECK-NEXT: ret
5653
entry:
5754
%1 = shufflevector <8 x i16> %a, <8 x i16> poison,
@@ -64,7 +61,6 @@ define <16 x i16> @concat_poison_v16i16_2(<8 x i16> %a, <8 x i16> %b) {
6461
; CHECK-LABEL: concat_poison_v16i16_2:
6562
; CHECK: # %bb.0: # %entry
6663
; CHECK-NEXT: vori.b $vr0, $vr1, 0
67-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
6864
; CHECK-NEXT: ret
6965
entry:
7066
%1 = shufflevector <8 x i16> %b, <8 x i16> poison,
@@ -91,7 +87,6 @@ define <8 x i32> @concat_poison_v8i32_1(<4 x i32> %a) {
9187
; CHECK-LABEL: concat_poison_v8i32_1:
9288
; CHECK: # %bb.0: # %entry
9389
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
94-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
9590
; CHECK-NEXT: ret
9691
entry:
9792
%1 = shufflevector <4 x i32> %a, <4 x i32> poison,
@@ -103,7 +98,6 @@ define <8 x i32> @concat_poison_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
10398
; CHECK-LABEL: concat_poison_v8i32_2:
10499
; CHECK: # %bb.0: # %entry
105100
; CHECK-NEXT: vori.b $vr0, $vr1, 0
106-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
107101
; CHECK-NEXT: ret
108102
entry:
109103
%1 = shufflevector <4 x i32> %b, <4 x i32> poison,
@@ -128,7 +122,6 @@ define <8 x float> @concat_poison_v8f32_1(<4 x float> %a) {
128122
; CHECK-LABEL: concat_poison_v8f32_1:
129123
; CHECK: # %bb.0: # %entry
130124
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
131-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
132125
; CHECK-NEXT: ret
133126
entry:
134127
%1 = shufflevector <4 x float> %a, <4 x float> poison,
@@ -140,7 +133,6 @@ define <8 x float> @concat_poison_v8f32_2(<4 x float> %a, <4 x float> %b) {
140133
; CHECK-LABEL: concat_poison_v8f32_2:
141134
; CHECK: # %bb.0: # %entry
142135
; CHECK-NEXT: vori.b $vr0, $vr1, 0
143-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
144136
; CHECK-NEXT: ret
145137
entry:
146138
%1 = shufflevector <4 x float> %b, <4 x float> poison,
@@ -165,7 +157,6 @@ define <4 x i64> @concat_poison_v8i64_1(<2 x i64> %a) {
165157
; CHECK-LABEL: concat_poison_v8i64_1:
166158
; CHECK: # %bb.0: # %entry
167159
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
168-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
169160
; CHECK-NEXT: ret
170161
entry:
171162
%1 = shufflevector <2 x i64> %a, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -176,7 +167,6 @@ define <4 x i64> @concat_poison_v8i64_2(<2 x i64> %a, <2 x i64> %b) {
176167
; CHECK-LABEL: concat_poison_v8i64_2:
177168
; CHECK: # %bb.0: # %entry
178169
; CHECK-NEXT: vori.b $vr0, $vr1, 0
179-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
180170
; CHECK-NEXT: ret
181171
entry:
182172
%1 = shufflevector <2 x i64> %b, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -199,7 +189,6 @@ define <4 x double> @concat_poison_v8f64_1(<2 x double> %a) {
199189
; CHECK-LABEL: concat_poison_v8f64_1:
200190
; CHECK: # %bb.0: # %entry
201191
; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0
202-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
203192
; CHECK-NEXT: ret
204193
entry:
205194
%1 = shufflevector <2 x double> %a, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -210,7 +199,6 @@ define <4 x double> @concat_poison_v8f64_2(<2 x double> %a, <2 x double> %b) {
210199
; CHECK-LABEL: concat_poison_v8f64_2:
211200
; CHECK: # %bb.0: # %entry
212201
; CHECK-NEXT: vori.b $vr0, $vr1, 0
213-
; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2
214202
; CHECK-NEXT: ret
215203
entry:
216204
%1 = shufflevector <2 x double> %b, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

0 commit comments

Comments
 (0)