1
- ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
1
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2
+ ; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s --check-prefixes=CHECKLE
3
+ ; RUN: llc < %s -mtriple=aarch64_be-none-eabi | FileCheck %s --check-prefixes=CHECKBE
2
4
3
5
define <8 x i8 > @vtrni8 (<8 x i8 >* %A , <8 x i8 >* %B ) nounwind {
4
- ;CHECK-LABEL: vtrni8:
5
- ;CHECK: trn1.8b
6
- ;CHECK: trn2.8b
7
- ;CHECK-NEXT: add.8b
6
+ ; CHECKLE-LABEL: vtrni8:
7
+ ; CHECKLE: // %bb.0:
8
+ ; CHECKLE-NEXT: ldr d0, [x0]
9
+ ; CHECKLE-NEXT: ldr d1, [x1]
10
+ ; CHECKLE-NEXT: trn1 v2.8b, v0.8b, v1.8b
11
+ ; CHECKLE-NEXT: trn2 v0.8b, v0.8b, v1.8b
12
+ ; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
13
+ ; CHECKLE-NEXT: ret
14
+ ;
15
+ ; CHECKBE-LABEL: vtrni8:
16
+ ; CHECKBE: // %bb.0:
17
+ ; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
18
+ ; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
19
+ ; CHECKBE-NEXT: trn1 v2.8b, v0.8b, v1.8b
20
+ ; CHECKBE-NEXT: trn2 v0.8b, v0.8b, v1.8b
21
+ ; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
22
+ ; CHECKBE-NEXT: rev64 v0.8b, v0.8b
23
+ ; CHECKBE-NEXT: ret
8
24
%tmp1 = load <8 x i8 >, <8 x i8 >* %A
9
25
%tmp2 = load <8 x i8 >, <8 x i8 >* %B
10
26
%tmp3 = shufflevector <8 x i8 > %tmp1 , <8 x i8 > %tmp2 , <8 x i32 > <i32 0 , i32 8 , i32 2 , i32 10 , i32 4 , i32 12 , i32 6 , i32 14 >
@@ -14,10 +30,24 @@ define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
14
30
}
15
31
16
32
define <4 x i16 > @vtrni16 (<4 x i16 >* %A , <4 x i16 >* %B ) nounwind {
17
- ;CHECK-LABEL: vtrni16:
18
- ;CHECK: trn1.4h
19
- ;CHECK: trn2.4h
20
- ;CHECK-NEXT: add.4h
33
+ ; CHECKLE-LABEL: vtrni16:
34
+ ; CHECKLE: // %bb.0:
35
+ ; CHECKLE-NEXT: ldr d0, [x0]
36
+ ; CHECKLE-NEXT: ldr d1, [x1]
37
+ ; CHECKLE-NEXT: trn1 v2.4h, v0.4h, v1.4h
38
+ ; CHECKLE-NEXT: trn2 v0.4h, v0.4h, v1.4h
39
+ ; CHECKLE-NEXT: add v0.4h, v2.4h, v0.4h
40
+ ; CHECKLE-NEXT: ret
41
+ ;
42
+ ; CHECKBE-LABEL: vtrni16:
43
+ ; CHECKBE: // %bb.0:
44
+ ; CHECKBE-NEXT: ld1 { v0.4h }, [x0]
45
+ ; CHECKBE-NEXT: ld1 { v1.4h }, [x1]
46
+ ; CHECKBE-NEXT: trn1 v2.4h, v0.4h, v1.4h
47
+ ; CHECKBE-NEXT: trn2 v0.4h, v0.4h, v1.4h
48
+ ; CHECKBE-NEXT: add v0.4h, v2.4h, v0.4h
49
+ ; CHECKBE-NEXT: rev64 v0.4h, v0.4h
50
+ ; CHECKBE-NEXT: ret
21
51
%tmp1 = load <4 x i16 >, <4 x i16 >* %A
22
52
%tmp2 = load <4 x i16 >, <4 x i16 >* %B
23
53
%tmp3 = shufflevector <4 x i16 > %tmp1 , <4 x i16 > %tmp2 , <4 x i32 > <i32 0 , i32 4 , i32 2 , i32 6 >
@@ -26,12 +56,49 @@ define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
26
56
ret <4 x i16 > %tmp5
27
57
}
28
58
59
; Transpose expressed as a byte shuffle: both <4 x i16> inputs are loaded,
; bitcast to <8 x i8>, and shuffled at byte granularity. The assertions for
; both triples expect this to lower to a single trn1 on .4h lanes; the
; big-endian output additionally has a rev64 before the return.
+ define <8 x i8 > @vtrni16_viabitcast (<4 x i16 > *%A , <4 x i16 > *%B ) nounwind {
60
+ ; CHECKLE-LABEL: vtrni16_viabitcast:
61
+ ; CHECKLE: // %bb.0:
62
+ ; CHECKLE-NEXT: ldr d0, [x0]
63
+ ; CHECKLE-NEXT: ldr d1, [x1]
64
+ ; CHECKLE-NEXT: trn1 v0.4h, v0.4h, v1.4h
65
+ ; CHECKLE-NEXT: ret
66
+ ;
67
+ ; CHECKBE-LABEL: vtrni16_viabitcast:
68
+ ; CHECKBE: // %bb.0:
69
+ ; CHECKBE-NEXT: ld1 { v0.4h }, [x0]
70
+ ; CHECKBE-NEXT: ld1 { v1.4h }, [x1]
71
+ ; CHECKBE-NEXT: trn1 v0.4h, v0.4h, v1.4h
72
+ ; CHECKBE-NEXT: rev64 v0.4h, v0.4h
73
+ ; CHECKBE-NEXT: ret
74
+ %l1 = load <4 x i16 >, <4 x i16 > *%A
75
+ %l2 = load <4 x i16 >, <4 x i16 > *%B
76
+ %b1 = bitcast <4 x i16 > %l1 to <8 x i8 >
77
+ %b2 = bitcast <4 x i16 > %l2 to <8 x i8 >
78
; The mask selects byte pairs (0,1) and (4,5) from %b1 and (8,9) and (12,13),
; i.e. bytes (0,1) and (4,5) of %b2, interleaved: the byte pairs that make up
; 16-bit lanes 0 and 2 of each input.
+ %tmp3 = shufflevector <8 x i8 > %b1 , <8 x i8 > %b2 , <8 x i32 > <i32 0 , i32 1 , i32 8 , i32 9 , i32 4 , i32 5 , i32 12 , i32 13 >
79
+ ret <8 x i8 > %tmp3
80
+ }
81
+
29
82
; For 2 x i32 vectors TRN is equivalent to ZIP, so zip1/zip2 are expected here instead of trn1/trn2.
30
83
define <2 x i32 > @vtrni32 (<2 x i32 >* %A , <2 x i32 >* %B ) nounwind {
31
- ;CHECK-LABEL: vtrni32:
32
- ;CHECK: zip1.2s
33
- ;CHECK: zip2.2s
34
- ;CHECK-NEXT: add.2s
84
+ ; CHECKLE-LABEL: vtrni32:
85
+ ; CHECKLE: // %bb.0:
86
+ ; CHECKLE-NEXT: ldr d0, [x0]
87
+ ; CHECKLE-NEXT: ldr d1, [x1]
88
+ ; CHECKLE-NEXT: zip1 v2.2s, v0.2s, v1.2s
89
+ ; CHECKLE-NEXT: zip2 v0.2s, v0.2s, v1.2s
90
+ ; CHECKLE-NEXT: add v0.2s, v2.2s, v0.2s
91
+ ; CHECKLE-NEXT: ret
92
+ ;
93
+ ; CHECKBE-LABEL: vtrni32:
94
+ ; CHECKBE: // %bb.0:
95
+ ; CHECKBE-NEXT: ld1 { v0.2s }, [x0]
96
+ ; CHECKBE-NEXT: ld1 { v1.2s }, [x1]
97
+ ; CHECKBE-NEXT: zip1 v2.2s, v0.2s, v1.2s
98
+ ; CHECKBE-NEXT: zip2 v0.2s, v0.2s, v1.2s
99
+ ; CHECKBE-NEXT: add v0.2s, v2.2s, v0.2s
100
+ ; CHECKBE-NEXT: rev64 v0.2s, v0.2s
101
+ ; CHECKBE-NEXT: ret
35
102
%tmp1 = load <2 x i32 >, <2 x i32 >* %A
36
103
%tmp2 = load <2 x i32 >, <2 x i32 >* %B
37
104
%tmp3 = shufflevector <2 x i32 > %tmp1 , <2 x i32 > %tmp2 , <2 x i32 > <i32 0 , i32 2 >
@@ -41,10 +108,24 @@ define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
41
108
}
42
109
43
110
define <2 x float > @vtrnf (<2 x float >* %A , <2 x float >* %B ) nounwind {
44
- ;CHECK-LABEL: vtrnf:
45
- ;CHECK: zip1.2s
46
- ;CHECK: zip2.2s
47
- ;CHECK-NEXT: fadd.2s
111
+ ; CHECKLE-LABEL: vtrnf:
112
+ ; CHECKLE: // %bb.0:
113
+ ; CHECKLE-NEXT: ldr d0, [x0]
114
+ ; CHECKLE-NEXT: ldr d1, [x1]
115
+ ; CHECKLE-NEXT: zip1 v2.2s, v0.2s, v1.2s
116
+ ; CHECKLE-NEXT: zip2 v0.2s, v0.2s, v1.2s
117
+ ; CHECKLE-NEXT: fadd v0.2s, v2.2s, v0.2s
118
+ ; CHECKLE-NEXT: ret
119
+ ;
120
+ ; CHECKBE-LABEL: vtrnf:
121
+ ; CHECKBE: // %bb.0:
122
+ ; CHECKBE-NEXT: ld1 { v0.2s }, [x0]
123
+ ; CHECKBE-NEXT: ld1 { v1.2s }, [x1]
124
+ ; CHECKBE-NEXT: zip1 v2.2s, v0.2s, v1.2s
125
+ ; CHECKBE-NEXT: zip2 v0.2s, v0.2s, v1.2s
126
+ ; CHECKBE-NEXT: fadd v0.2s, v2.2s, v0.2s
127
+ ; CHECKBE-NEXT: rev64 v0.2s, v0.2s
128
+ ; CHECKBE-NEXT: ret
48
129
%tmp1 = load <2 x float >, <2 x float >* %A
49
130
%tmp2 = load <2 x float >, <2 x float >* %B
50
131
%tmp3 = shufflevector <2 x float > %tmp1 , <2 x float > %tmp2 , <2 x i32 > <i32 0 , i32 2 >
@@ -54,10 +135,25 @@ define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
54
135
}
55
136
56
137
define <16 x i8 > @vtrnQi8 (<16 x i8 >* %A , <16 x i8 >* %B ) nounwind {
57
- ;CHECK-LABEL: vtrnQi8:
58
- ;CHECK: trn1.16b
59
- ;CHECK: trn2.16b
60
- ;CHECK-NEXT: add.16b
138
+ ; CHECKLE-LABEL: vtrnQi8:
139
+ ; CHECKLE: // %bb.0:
140
+ ; CHECKLE-NEXT: ldr q0, [x0]
141
+ ; CHECKLE-NEXT: ldr q1, [x1]
142
+ ; CHECKLE-NEXT: trn1 v2.16b, v0.16b, v1.16b
143
+ ; CHECKLE-NEXT: trn2 v0.16b, v0.16b, v1.16b
144
+ ; CHECKLE-NEXT: add v0.16b, v2.16b, v0.16b
145
+ ; CHECKLE-NEXT: ret
146
+ ;
147
+ ; CHECKBE-LABEL: vtrnQi8:
148
+ ; CHECKBE: // %bb.0:
149
+ ; CHECKBE-NEXT: ld1 { v0.16b }, [x0]
150
+ ; CHECKBE-NEXT: ld1 { v1.16b }, [x1]
151
+ ; CHECKBE-NEXT: trn1 v2.16b, v0.16b, v1.16b
152
+ ; CHECKBE-NEXT: trn2 v0.16b, v0.16b, v1.16b
153
+ ; CHECKBE-NEXT: add v0.16b, v2.16b, v0.16b
154
+ ; CHECKBE-NEXT: rev64 v0.16b, v0.16b
155
+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
156
+ ; CHECKBE-NEXT: ret
61
157
%tmp1 = load <16 x i8 >, <16 x i8 >* %A
62
158
%tmp2 = load <16 x i8 >, <16 x i8 >* %B
63
159
%tmp3 = shufflevector <16 x i8 > %tmp1 , <16 x i8 > %tmp2 , <16 x i32 > <i32 0 , i32 16 , i32 2 , i32 18 , i32 4 , i32 20 , i32 6 , i32 22 , i32 8 , i32 24 , i32 10 , i32 26 , i32 12 , i32 28 , i32 14 , i32 30 >
@@ -67,10 +163,25 @@ define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
67
163
}
68
164
69
165
define <8 x i16 > @vtrnQi16 (<8 x i16 >* %A , <8 x i16 >* %B ) nounwind {
70
- ;CHECK-LABEL: vtrnQi16:
71
- ;CHECK: trn1.8h
72
- ;CHECK: trn2.8h
73
- ;CHECK-NEXT: add.8h
166
+ ; CHECKLE-LABEL: vtrnQi16:
167
+ ; CHECKLE: // %bb.0:
168
+ ; CHECKLE-NEXT: ldr q0, [x0]
169
+ ; CHECKLE-NEXT: ldr q1, [x1]
170
+ ; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h
171
+ ; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h
172
+ ; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h
173
+ ; CHECKLE-NEXT: ret
174
+ ;
175
+ ; CHECKBE-LABEL: vtrnQi16:
176
+ ; CHECKBE: // %bb.0:
177
+ ; CHECKBE-NEXT: ld1 { v0.8h }, [x0]
178
+ ; CHECKBE-NEXT: ld1 { v1.8h }, [x1]
179
+ ; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h
180
+ ; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h
181
+ ; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h
182
+ ; CHECKBE-NEXT: rev64 v0.8h, v0.8h
183
+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
184
+ ; CHECKBE-NEXT: ret
74
185
%tmp1 = load <8 x i16 >, <8 x i16 >* %A
75
186
%tmp2 = load <8 x i16 >, <8 x i16 >* %B
76
187
%tmp3 = shufflevector <8 x i16 > %tmp1 , <8 x i16 > %tmp2 , <8 x i32 > <i32 0 , i32 8 , i32 2 , i32 10 , i32 4 , i32 12 , i32 6 , i32 14 >
@@ -80,10 +191,25 @@ define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
80
191
}
81
192
82
193
define <4 x i32 > @vtrnQi32 (<4 x i32 >* %A , <4 x i32 >* %B ) nounwind {
83
- ;CHECK-LABEL: vtrnQi32:
84
- ;CHECK: trn1.4s
85
- ;CHECK: trn2.4s
86
- ;CHECK-NEXT: add.4s
194
+ ; CHECKLE-LABEL: vtrnQi32:
195
+ ; CHECKLE: // %bb.0:
196
+ ; CHECKLE-NEXT: ldr q0, [x0]
197
+ ; CHECKLE-NEXT: ldr q1, [x1]
198
+ ; CHECKLE-NEXT: trn1 v2.4s, v0.4s, v1.4s
199
+ ; CHECKLE-NEXT: trn2 v0.4s, v0.4s, v1.4s
200
+ ; CHECKLE-NEXT: add v0.4s, v2.4s, v0.4s
201
+ ; CHECKLE-NEXT: ret
202
+ ;
203
+ ; CHECKBE-LABEL: vtrnQi32:
204
+ ; CHECKBE: // %bb.0:
205
+ ; CHECKBE-NEXT: ld1 { v0.4s }, [x0]
206
+ ; CHECKBE-NEXT: ld1 { v1.4s }, [x1]
207
+ ; CHECKBE-NEXT: trn1 v2.4s, v0.4s, v1.4s
208
+ ; CHECKBE-NEXT: trn2 v0.4s, v0.4s, v1.4s
209
+ ; CHECKBE-NEXT: add v0.4s, v2.4s, v0.4s
210
+ ; CHECKBE-NEXT: rev64 v0.4s, v0.4s
211
+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
212
+ ; CHECKBE-NEXT: ret
87
213
%tmp1 = load <4 x i32 >, <4 x i32 >* %A
88
214
%tmp2 = load <4 x i32 >, <4 x i32 >* %B
89
215
%tmp3 = shufflevector <4 x i32 > %tmp1 , <4 x i32 > %tmp2 , <4 x i32 > <i32 0 , i32 4 , i32 2 , i32 6 >
@@ -93,10 +219,25 @@ define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
93
219
}
94
220
95
221
define <4 x float > @vtrnQf (<4 x float >* %A , <4 x float >* %B ) nounwind {
96
- ;CHECK-LABEL: vtrnQf:
97
- ;CHECK: trn1.4s
98
- ;CHECK: trn2.4s
99
- ;CHECK-NEXT: fadd.4s
222
+ ; CHECKLE-LABEL: vtrnQf:
223
+ ; CHECKLE: // %bb.0:
224
+ ; CHECKLE-NEXT: ldr q0, [x0]
225
+ ; CHECKLE-NEXT: ldr q1, [x1]
226
+ ; CHECKLE-NEXT: trn1 v2.4s, v0.4s, v1.4s
227
+ ; CHECKLE-NEXT: trn2 v0.4s, v0.4s, v1.4s
228
+ ; CHECKLE-NEXT: fadd v0.4s, v2.4s, v0.4s
229
+ ; CHECKLE-NEXT: ret
230
+ ;
231
+ ; CHECKBE-LABEL: vtrnQf:
232
+ ; CHECKBE: // %bb.0:
233
+ ; CHECKBE-NEXT: ld1 { v0.4s }, [x0]
234
+ ; CHECKBE-NEXT: ld1 { v1.4s }, [x1]
235
+ ; CHECKBE-NEXT: trn1 v2.4s, v0.4s, v1.4s
236
+ ; CHECKBE-NEXT: trn2 v0.4s, v0.4s, v1.4s
237
+ ; CHECKBE-NEXT: fadd v0.4s, v2.4s, v0.4s
238
+ ; CHECKBE-NEXT: rev64 v0.4s, v0.4s
239
+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
240
+ ; CHECKBE-NEXT: ret
100
241
%tmp1 = load <4 x float >, <4 x float >* %A
101
242
%tmp2 = load <4 x float >, <4 x float >* %B
102
243
%tmp3 = shufflevector <4 x float > %tmp1 , <4 x float > %tmp2 , <4 x i32 > <i32 0 , i32 4 , i32 2 , i32 6 >
@@ -108,10 +249,24 @@ define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
108
249
; Undef shuffle indices should not prevent matching to VTRN:
109
250
110
251
define <8 x i8 > @vtrni8_undef (<8 x i8 >* %A , <8 x i8 >* %B ) nounwind {
111
- ;CHECK-LABEL: vtrni8_undef:
112
- ;CHECK: trn1.8b
113
- ;CHECK: trn2.8b
114
- ;CHECK-NEXT: add.8b
252
+ ; CHECKLE-LABEL: vtrni8_undef:
253
+ ; CHECKLE: // %bb.0:
254
+ ; CHECKLE-NEXT: ldr d0, [x0]
255
+ ; CHECKLE-NEXT: ldr d1, [x1]
256
+ ; CHECKLE-NEXT: trn1 v2.8b, v0.8b, v1.8b
257
+ ; CHECKLE-NEXT: trn2 v0.8b, v0.8b, v1.8b
258
+ ; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
259
+ ; CHECKLE-NEXT: ret
260
+ ;
261
+ ; CHECKBE-LABEL: vtrni8_undef:
262
+ ; CHECKBE: // %bb.0:
263
+ ; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
264
+ ; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
265
+ ; CHECKBE-NEXT: trn1 v2.8b, v0.8b, v1.8b
266
+ ; CHECKBE-NEXT: trn2 v0.8b, v0.8b, v1.8b
267
+ ; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
268
+ ; CHECKBE-NEXT: rev64 v0.8b, v0.8b
269
+ ; CHECKBE-NEXT: ret
115
270
%tmp1 = load <8 x i8 >, <8 x i8 >* %A
116
271
%tmp2 = load <8 x i8 >, <8 x i8 >* %B
117
272
%tmp3 = shufflevector <8 x i8 > %tmp1 , <8 x i8 > %tmp2 , <8 x i32 > <i32 0 , i32 undef , i32 2 , i32 10 , i32 undef , i32 12 , i32 6 , i32 14 >
@@ -121,10 +276,25 @@ define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
121
276
}
122
277
123
278
define <8 x i16 > @vtrnQi16_undef (<8 x i16 >* %A , <8 x i16 >* %B ) nounwind {
124
- ;CHECK-LABEL: vtrnQi16_undef:
125
- ;CHECK: trn1.8h
126
- ;CHECK: trn2.8h
127
- ;CHECK-NEXT: add.8h
279
+ ; CHECKLE-LABEL: vtrnQi16_undef:
280
+ ; CHECKLE: // %bb.0:
281
+ ; CHECKLE-NEXT: ldr q0, [x0]
282
+ ; CHECKLE-NEXT: ldr q1, [x1]
283
+ ; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h
284
+ ; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h
285
+ ; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h
286
+ ; CHECKLE-NEXT: ret
287
+ ;
288
+ ; CHECKBE-LABEL: vtrnQi16_undef:
289
+ ; CHECKBE: // %bb.0:
290
+ ; CHECKBE-NEXT: ld1 { v0.8h }, [x0]
291
+ ; CHECKBE-NEXT: ld1 { v1.8h }, [x1]
292
+ ; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h
293
+ ; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h
294
+ ; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h
295
+ ; CHECKBE-NEXT: rev64 v0.8h, v0.8h
296
+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
297
+ ; CHECKBE-NEXT: ret
128
298
%tmp1 = load <8 x i16 >, <8 x i16 >* %A
129
299
%tmp2 = load <8 x i16 >, <8 x i16 >* %B
130
300
%tmp3 = shufflevector <8 x i16 > %tmp1 , <8 x i16 > %tmp2 , <8 x i32 > <i32 0 , i32 8 , i32 undef , i32 undef , i32 4 , i32 12 , i32 6 , i32 14 >
0 commit comments