Skip to content

Commit f5056c8

Browse files
committed
[AArch64] Improve shuffle vector by using wider types
Try to widen the element type to obtain a new shuffle mask that allows a better permutation sequence, so that NEON shuffle instructions such as ZIP1/2, UZP1/2, TRN1/2, REV, INS, etc. can be used.
For example:
  shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
is equivalent to:
  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
Finally, this can be lowered to:
  mov v0.d[0], v1.d[1]
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D111619
1 parent 1ef6bd9 commit f5056c8

File tree

4 files changed

+264
-4
lines changed

4 files changed

+264
-4
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9577,6 +9577,86 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
95779577
return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
95789578
}
95799579

9580+
// Return true if we can get a new shuffle mask by checking the parameter mask
9581+
// array to test whether every two adjacent mask values are continuous and
9582+
// starting from an even number.
9583+
static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
9584+
SmallVectorImpl<int> &NewMask) {
9585+
unsigned NumElts = VT.getVectorNumElements();
9586+
if (NumElts % 2 != 0)
9587+
return false;
9588+
9589+
NewMask.clear();
9590+
for (unsigned i = 0; i < NumElts; i += 2) {
9591+
int M0 = M[i];
9592+
int M1 = M[i + 1];
9593+
9594+
// If both elements are undef, new mask is undef too.
9595+
if (M0 == -1 && M1 == -1) {
9596+
NewMask.push_back(-1);
9597+
continue;
9598+
}
9599+
9600+
if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
9601+
NewMask.push_back(M1 / 2);
9602+
continue;
9603+
}
9604+
9605+
if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
9606+
NewMask.push_back(M0 / 2);
9607+
continue;
9608+
}
9609+
9610+
NewMask.clear();
9611+
return false;
9612+
}
9613+
9614+
assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
9615+
return true;
9616+
}
9617+
9618+
// Try to widen element type to get a new mask value for a better permutation
9619+
// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
9620+
// UZP1/2, TRN1/2, REV, INS, etc.
9621+
// For example:
9622+
// shufflevector <4 x i32> %a, <4 x i32> %b,
9623+
// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
9624+
// is equivalent to:
9625+
// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
9626+
// Finally, we can get:
9627+
// mov v0.d[0], v1.d[1]
9628+
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
9629+
SDLoc DL(Op);
9630+
EVT VT = Op.getValueType();
9631+
EVT ScalarVT = VT.getVectorElementType();
9632+
unsigned ElementSize = ScalarVT.getFixedSizeInBits();
9633+
SDValue V0 = Op.getOperand(0);
9634+
SDValue V1 = Op.getOperand(1);
9635+
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
9636+
9637+
// If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
9638+
// We need to make sure the wider element type is legal. Thus, ElementSize
9639+
// should be not larger than 32 bits, and i1 type should also be excluded.
9640+
if (ElementSize > 32 || ElementSize == 1)
9641+
return SDValue();
9642+
9643+
SmallVector<int, 8> NewMask;
9644+
if (isWideTypeMask(Mask, VT, NewMask)) {
9645+
MVT NewEltVT = VT.isFloatingPoint()
9646+
? MVT::getFloatingPointVT(ElementSize * 2)
9647+
: MVT::getIntegerVT(ElementSize * 2);
9648+
MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
9649+
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
9650+
V0 = DAG.getBitcast(NewVT, V0);
9651+
V1 = DAG.getBitcast(NewVT, V1);
9652+
return DAG.getBitcast(VT,
9653+
DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
9654+
}
9655+
}
9656+
9657+
return SDValue();
9658+
}
9659+
95809660
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
95819661
SelectionDAG &DAG) const {
95829662
SDLoc dl(Op);
@@ -9724,6 +9804,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
97249804
DstLaneV);
97259805
}
97269806

9807+
if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
9808+
return NewSD;
9809+
97279810
// If the shuffle is not directly supported and it has 4 elements, use
97289811
// the PerfectShuffle-generated table to synthesize it from other shuffles.
97299812
unsigned NumElts = VT.getVectorNumElements();

llvm/test/CodeGen/AArch64/concat-vector.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,7 @@ define <8 x i32> @concat8(<4 x i32>* %A, <4 x i32>* %B) {
8888
define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) {
8989
; CHECK-LABEL: concat9:
9090
; CHECK: // %bb.0:
91-
; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #4
92-
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
91+
; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
9392
; CHECK-NEXT: ret
9493
%v4half= shufflevector <2 x half> %A, <2 x half> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
9594
ret <4 x half> %v4half
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
3+
4+
; Half-precision shuffle whose IR mask pairs up as (2,3) and (0,undef): each
; pair names one even-aligned run of two elements (the second with an undef
; tail). The CHECK lines expect a single zip1 on 32-bit (.2s) lanes.
define <4 x half> @shuffle1(<2 x half> %a, <2 x half> %b) {
5+
; CHECK-LABEL: shuffle1:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s
8+
; CHECK-NEXT: ret
9+
entry:
10+
%res = shufflevector <2 x half> %a, <2 x half> %b, <4 x i32> <i32 2, i32 3, i32 0, i32 undef>
11+
ret <4 x half> %res
12+
}
13+
14+
; Half-precision shuffle with undef in both outer slots: the IR mask pairs are
; (undef,1) and (2,undef), i.e. an odd upper half with undef below it and an
; even lower half with undef above it. The CHECK lines expect a single zip1 on
; 32-bit (.2s) lanes.
define <4 x half> @shuffle2(<2 x half> %a, <2 x half> %b) {
15+
; CHECK-LABEL: shuffle2:
16+
; CHECK: // %bb.0: // %entry
17+
; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
18+
; CHECK-NEXT: ret
19+
entry:
20+
%res = shufflevector <2 x half> %a, <2 x half> %b, <4 x i32> <i32 undef, i32 1, i32 2, i32 undef>
21+
ret <4 x half> %res
22+
}
23+
24+
; <4 x i32> mask <6,7,2,3> pairs up as (6,7) and (2,3), which corresponds to
; the two-element mask <3,1> over 64-bit elements. The CHECK lines expect a
; single 64-bit lane move (mov v0.d[0], v1.d[1]).
define <4 x i32> @shuffle3(<4 x i32> %a, <4 x i32> %b) {
25+
; CHECK-LABEL: shuffle3:
26+
; CHECK: // %bb.0: // %entry
27+
; CHECK-NEXT: mov v0.d[0], v1.d[1]
28+
; CHECK-NEXT: ret
29+
entry:
30+
%res = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
31+
ret <4 x i32> %res
32+
}
33+
34+
; Floating-point variant: <4 x float> mask <0,1,6,7> pairs up as (0,1) and
; (6,7), which corresponds to the two-element mask <0,3> over 64-bit elements.
; The CHECK lines expect a single 64-bit lane move (mov v0.d[1], v1.d[1]).
define <4 x float> @shuffle4(<4 x float> %a, <4 x float> %b) {
35+
; CHECK-LABEL: shuffle4:
36+
; CHECK: // %bb.0: // %entry
37+
; CHECK-NEXT: mov v0.d[1], v1.d[1]
38+
; CHECK-NEXT: ret
39+
entry:
40+
%res = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7>
41+
ret <4 x float> %res
42+
}
43+
44+
; <16 x i8> mask made of byte pairs (0,1),(4,5),(8,9),...,(28,29). Taken as
; 16-bit elements this is <0,2,4,6,8,10,12,14>, i.e. the even halfword lanes of
; the two inputs. The CHECK lines expect a single uzp1 on .8h lanes.
define <16 x i8> @shuffle5(<16 x i8> %a, <16 x i8> %b) {
45+
; CHECK-LABEL: shuffle5:
46+
; CHECK: // %bb.0: // %entry
47+
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
48+
; CHECK-NEXT: ret
49+
entry:
50+
%res = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 4, i32 5,
51+
i32 8, i32 9, i32 12, i32 13,
52+
i32 16, i32 17, i32 20, i32 21,
53+
i32 24, i32 25, i32 28, i32 29>
54+
ret <16 x i8> %res
55+
}
56+
57+
; <16 x i8> mask made of byte pairs (0,1),(16,17),(4,5),(20,21),... Taken as
; 16-bit elements this is <0,8,2,10,4,12,6,14>, i.e. even halfword lanes of the
; two inputs interleaved. The CHECK lines expect a single trn1 on .8h lanes.
define <16 x i8> @shuffle6(<16 x i8> %a, <16 x i8> %b) {
58+
; CHECK-LABEL: shuffle6:
59+
; CHECK: // %bb.0: // %entry
60+
; CHECK-NEXT: trn1 v0.8h, v0.8h, v1.8h
61+
; CHECK-NEXT: ret
62+
entry:
63+
%res = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 17,
64+
i32 4, i32 5, i32 20, i32 21,
65+
i32 8, i32 9, i32 24, i32 25,
66+
i32 12, i32 13, i32 28, i32 29>
67+
ret <16 x i8> %res
68+
}
69+
70+
; <8 x i8> mask with partially undef pairs: (2,3),(6,undef),(undef,11),
; (14,undef). Taken as 16-bit elements this is <1,3,5,7>, i.e. the odd halfword
; lanes of the two inputs. The CHECK lines expect a single uzp2 on .4h lanes.
define <8 x i8> @shuffle7(<8 x i8> %a, <8 x i8> %b) {
71+
; CHECK-LABEL: shuffle7:
72+
; CHECK: // %bb.0: // %entry
73+
; CHECK-NEXT: uzp2 v0.4h, v0.4h, v1.4h
74+
; CHECK-NEXT: ret
75+
entry:
76+
%res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 6, i32 undef,
77+
i32 undef, i32 11, i32 14, i32 undef>
78+
ret <8 x i8> %res
79+
}
80+
81+
; <8 x i8> mask with partially undef pairs: (2,3),(10,undef),(undef,7),
; (14,undef). Taken as 16-bit elements this is <1,5,3,7>, i.e. odd halfword
; lanes of the two inputs interleaved. The CHECK lines expect a single trn2 on
; .4h lanes.
define <8 x i8> @shuffle8(<8 x i8> %a, <8 x i8> %b) {
82+
; CHECK-LABEL: shuffle8:
83+
; CHECK: // %bb.0: // %entry
84+
; CHECK-NEXT: trn2 v0.4h, v0.4h, v1.4h
85+
; CHECK-NEXT: ret
86+
entry:
87+
%res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 undef,
88+
i32 undef, i32 7, i32 14, i32 undef>
89+
ret <8 x i8> %res
90+
}
91+
92+
; No blocks
93+
; Single-input shuffle (second operand is undef). The <8 x i8> mask pairs up as
; (2,3),(0,1),(6,7),(4,5); taken as 16-bit elements this is <1,0,3,2>, which
; swaps the halfwords inside each 32-bit group. The CHECK lines expect a single
; rev32 on .4h lanes.
define <8 x i8> @shuffle9(<8 x i8> %a) {
94+
; CHECK-LABEL: shuffle9:
95+
; CHECK: // %bb.0:
96+
; CHECK-NEXT: rev32 v0.4h, v0.4h
97+
; CHECK-NEXT: ret
98+
%res = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1,
99+
i32 6, i32 7, i32 4, i32 5>
100+
ret <8 x i8> %res
101+
}
102+
103+
; Single-input shuffle (second operand is undef) with a fully undef pair. The
; <8 x i16> mask pairs up as (2,3),(0,1),(undef,undef),(4,5); taken as 32-bit
; elements this is <1,0,undef,2>. The CHECK lines expect a single rev64 on .4s
; lanes.
define <8 x i16> @shuffle10(<8 x i16> %a) {
104+
; CHECK-LABEL: shuffle10:
105+
; CHECK: // %bb.0:
106+
; CHECK-NEXT: rev64 v0.4s, v0.4s
107+
; CHECK-NEXT: ret
108+
%res = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1,
109+
i32 undef, i32 undef, i32 4, i32 5>
110+
ret <8 x i16> %res
111+
}
112+
113+
; The result (<4 x i16>) is narrower than the inputs (<8 x i16>). The IR mask
; <8,9,0,1> pairs up as (8,9) and (0,1), i.e. the first 32 bits of %b followed
; by the first 32 bits of %a. The CHECK lines expect one 32-bit lane insert
; (mov v1.s[1], v0.s[0]) plus a register move of the result.
define <4 x i16> @shuffle11(<8 x i16> %a, <8 x i16> %b) {
114+
; CHECK-LABEL: shuffle11:
115+
; CHECK: // %bb.0: // %entry
116+
; CHECK-NEXT: mov v1.s[1], v0.s[0]
117+
; CHECK-NEXT: fmov d0, d1
118+
; CHECK-NEXT: ret
119+
entry:
120+
%res = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> <i32 8, i32 9, i32 0, i32 1>
121+
ret <4 x i16> %res
122+
}
123+
124+
; <8 x i8> mask pairs: (4,5),(4,undef),(undef,13),(12,undef). Taken as 16-bit
; elements this is <2,2,6,6>. Per the CHECK lines the expected lowering is a
; two-instruction sequence on .4h lanes (uzp1 followed by trn2).
define <8 x i8> @shuffle12(<8 x i8> %a, <8 x i8> %b) {
125+
; CHECK-LABEL: shuffle12:
126+
; CHECK: // %bb.0: // %entry
127+
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
128+
; CHECK-NEXT: trn2 v0.4h, v0.4h, v0.4h
129+
; CHECK-NEXT: ret
130+
entry:
131+
%res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 5, i32 4, i32 undef,
132+
i32 undef, i32 13, i32 12, i32 undef>
133+
ret <8 x i8> %res
134+
}
135+
136+
; Negative test: the IR mask contains the descending pairs (7,6) and (3,2),
; which do not name two consecutive elements starting at an even index, so the
; mask as a whole cannot be expressed over 32-bit elements. The CHECK lines
; record the multi-instruction rev32/ext/mov sequence that is expected instead.
; NOTE(review): the name "faili1" looks like a typo for "fail1" (compare
; shuffle_widen_fail2 / shuffle_widen_fail3) - confirm before renaming, since
; the CHECK-LABEL must change with it.
define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) {
137+
; CHECK-LABEL: shuffle_widen_faili1:
138+
; CHECK: // %bb.0: // %entry
139+
; CHECK-NEXT: rev32 v2.4h, v0.4h
140+
; CHECK-NEXT: rev32 v3.4h, v1.4h
141+
; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4
142+
; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4
143+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
144+
; CHECK-NEXT: ret
145+
entry:
146+
%res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 7, i32 6, i32 0, i32 1,
147+
i32 3, i32 2, i32 4, i32 5>
148+
ret <8 x i16> %res
149+
}
150+
151+
; Negative test: the IR mask contains the pair (6,6), which repeats an element
; instead of naming two consecutive ones, and the pair (undef,2), whose known
; upper half is an even index. Neither can form a single 32-bit element, so
; the mask as a whole cannot be widened. The CHECK lines record the
; multi-instruction uzp1/trn1/ext/mov sequence that is expected instead.
define <8 x i16> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) {
152+
; CHECK-LABEL: shuffle_widen_fail2:
153+
; CHECK: // %bb.0: // %entry
154+
; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
155+
; CHECK-NEXT: trn1 v3.4h, v1.4h, v1.4h
156+
; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4
157+
; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4
158+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
159+
; CHECK-NEXT: ret
160+
entry:
161+
%res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 6, i32 6, i32 0, i32 1,
162+
i32 undef, i32 2, i32 4, i32 5>
163+
ret <8 x i16> %res
164+
}
165+
166+
; Negative test: no adjacent pair in the mask <1,5,12,14,10,6,7,13> starts at
; an even index and continues with the next element (the first pair is (1,5)),
; so the mask cannot be expressed over 32-bit elements. The CHECK lines expect
; the generic lowering: a constant-pool index vector loaded for a tbl lookup.
define <8 x i16> @shuffle_widen_fail3(<8 x i16> %a, <8 x i16> %b) {
167+
; CHECK-LABEL: shuffle_widen_fail3:
168+
; CHECK: // %bb.0: // %entry
169+
; CHECK-NEXT: adrp x8, .LCPI14_0
170+
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
171+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
172+
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0]
173+
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
174+
; CHECK-NEXT: ret
175+
entry:
176+
%res = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 5, i32 12, i32 14,
177+
i32 10, i32 6, i32 7, i32 13>
178+
ret <8 x i16> %res
179+
}

llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -460,8 +460,7 @@ define void @concat_v32i64(<16 x i64>* %a, <16 x i64>* %b, <32 x i64>* %c) #0 {
460460
; Don't use SVE for 64-bit vectors.
461461
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) #0 {
462462
; CHECK-LABEL: concat_v4f16:
463-
; CHECK: ext v0.8b, v0.8b, v0.8b, #4
464-
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
463+
; CHECK: zip1 v0.2s, v0.2s, v1.2s
465464
; CHECK-NEXT: ret
466465
%res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
467466
ret <4 x half> %res

0 commit comments

Comments
 (0)