1
1
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2
- ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512F
3
- ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512BW
2
+ ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F
3
+ ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW
4
+ ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F
5
+ ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW
4
6
5
7
target triple = "x86_64-unknown-unknown"
6
8
@@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
14
16
}
15
17
16
18
define <16 x float > @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08 (<16 x float > %a , <16 x float > %b ) {
17
- ; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
18
- ; ALL: # %bb.0:
19
- ; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
20
- ; ALL-NEXT: vbroadcastss %xmm0, %zmm0
21
- ; ALL-NEXT: retq
19
+ ; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
20
+ ; SLOW: # %bb.0:
21
+ ; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
22
+ ; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
23
+ ; SLOW-NEXT: retq
24
+ ;
25
+ ; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
26
+ ; FAST: # %bb.0:
27
+ ; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
28
+ ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
29
+ ; FAST-NEXT: retq
22
30
%shuffle = shufflevector <16 x float > %a , <16 x float > %b , <16 x i32 ><i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 >
23
31
ret <16 x float > %shuffle
24
32
}
25
33
26
34
define <16 x float > @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc (<16 x i32 > %a , <16 x i32 > %b ) {
27
- ; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
28
- ; ALL: # %bb.0:
29
- ; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
30
- ; ALL-NEXT: vbroadcastss %xmm0, %zmm0
31
- ; ALL-NEXT: retq
35
+ ; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
36
+ ; SLOW: # %bb.0:
37
+ ; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
38
+ ; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
39
+ ; SLOW-NEXT: retq
40
+ ;
41
+ ; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
42
+ ; FAST: # %bb.0:
43
+ ; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
44
+ ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
45
+ ; FAST-NEXT: retq
32
46
%tmp0 = bitcast <16 x i32 > %a to <16 x float >
33
47
%tmp1 = bitcast <16 x i32 > %b to <16 x float >
34
48
%shuffle = shufflevector <16 x float > %tmp0 , <16 x float > %tmp1 , <16 x i32 ><i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 , i32 8 >
@@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1
196
210
197
211
; PR86076
198
212
define <16 x float > @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08 (float %a0 , float %a1 ) {
199
- ; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
200
- ; ALL: # %bb.0:
201
- ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
202
- ; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
203
- ; ALL-NEXT: retq
213
+ ; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
214
+ ; SLOW: # %bb.0:
215
+ ; SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
216
+ ; SLOW-NEXT: vbroadcastsd %xmm0, %zmm0
217
+ ; SLOW-NEXT: retq
218
+ ;
219
+ ; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
220
+ ; FAST: # %bb.0:
221
+ ; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
222
+ ; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
223
+ ; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
224
+ ; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
225
+ ; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
226
+ ; FAST-NEXT: retq
204
227
%v0 = insertelement <8 x float > poison, float %a0 , i64 0
205
228
%v1 = insertelement <8 x float > poison, float %a1 , i64 0
206
229
%sv = shufflevector <8 x float > %v0 , <8 x float > %v1 , <16 x i32 > <i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 , i32 0 , i32 8 >
@@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
217
240
}
218
241
219
242
define <16 x i32 > @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04 (<16 x i32 > %a , <16 x i32 > %b ) {
220
- ; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
221
- ; ALL: # %bb.0:
222
- ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
223
- ; ALL-NEXT: vbroadcastss %xmm0, %zmm0
224
- ; ALL-NEXT: retq
243
+ ; SLOW-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
244
+ ; SLOW: # %bb.0:
245
+ ; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
246
+ ; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
247
+ ; SLOW-NEXT: retq
248
+ ;
249
+ ; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
250
+ ; FAST: # %bb.0:
251
+ ; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
252
+ ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
253
+ ; FAST-NEXT: retq
225
254
%shuffle = shufflevector <16 x i32 > %a , <16 x i32 > %b , <16 x i32 ><i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 >
226
255
ret <16 x i32 > %shuffle
227
256
}
@@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08
302
331
303
332
; PR46249
304
333
define <16 x i32 > @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04 (<16 x i32 > %a ) {
305
- ; ALL-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
306
- ; ALL: # %bb.0:
307
- ; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
308
- ; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
309
- ; ALL-NEXT: retq
334
+ ; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
335
+ ; SLOW: # %bb.0:
336
+ ; SLOW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
337
+ ; SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
338
+ ; SLOW-NEXT: retq
339
+ ;
340
+ ; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
341
+ ; FAST: # %bb.0:
342
+ ; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
343
+ ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
344
+ ; FAST-NEXT: retq
310
345
%1 = shufflevector <16 x i32 > %a , <16 x i32 > undef , <16 x i32 > <i32 11 , i32 10 , i32 9 , i32 8 , i32 15 , i32 14 , i32 13 , i32 12 , i32 3 , i32 2 , i32 1 , i32 0 , i32 7 , i32 6 , i32 5 , i32 4 >
311
346
ret <16 x i32 > %1
312
347
}
313
348
314
349
define <16 x float > @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04 (<16 x float > %a ) {
315
- ; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
316
- ; ALL: # %bb.0:
317
- ; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
318
- ; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
319
- ; ALL-NEXT: retq
350
+ ; SLOW-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
351
+ ; SLOW: # %bb.0:
352
+ ; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
353
+ ; SLOW-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
354
+ ; SLOW-NEXT: retq
355
+ ;
356
+ ; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
357
+ ; FAST: # %bb.0:
358
+ ; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
359
+ ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
360
+ ; FAST-NEXT: retq
320
361
%1 = shufflevector <16 x float > %a , <16 x float > undef , <16 x i32 > <i32 11 , i32 10 , i32 9 , i32 8 , i32 15 , i32 14 , i32 13 , i32 12 , i32 3 , i32 2 , i32 1 , i32 0 , i32 7 , i32 6 , i32 5 , i32 4 >
321
362
ret <16 x float > %1
322
363
}
@@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
333
374
}
334
375
335
376
define <16 x float > @shuffle_v16f32_load_08_11_10_00_12_15_14_04 (<16 x float > %a0 , ptr %a1 ) {
336
- ; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
337
- ; ALL: # %bb.0:
338
- ; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
339
- ; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
340
- ; ALL-NEXT: retq
377
+ ; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
378
+ ; SLOW: # %bb.0:
379
+ ; SLOW-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
380
+ ; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
381
+ ; SLOW-NEXT: retq
382
+ ;
383
+ ; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
384
+ ; FAST: # %bb.0:
385
+ ; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
386
+ ; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
387
+ ; FAST-NEXT: retq
341
388
%1 = load <16 x float >, ptr %a1
342
389
%2 = shufflevector <16 x float > %1 , <16 x float > %a0 , <16 x i32 > <i32 16 , i32 19 , i32 18 , i32 0 , i32 20 , i32 23 , i32 22 , i32 4 , i32 24 , i32 27 , i32 26 , i32 8 , i32 28 , i32 31 , i32 30 , i32 12 >
343
390
ret <16 x float > %2
@@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
365
412
366
413
;FIXME: can do better with vpcompress
367
414
define <8 x i32 > @test_v16i32_1_3_5_7_9_11_13_15 (<16 x i32 > %v ) {
368
- ; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
369
- ; ALL: # %bb.0:
370
- ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
371
- ; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
372
- ; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
373
- ; ALL-NEXT: retq
415
+ ; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
416
+ ; SLOW: # %bb.0:
417
+ ; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
418
+ ; SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
419
+ ; SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
420
+ ; SLOW-NEXT: retq
421
+ ;
422
+ ; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
423
+ ; FAST: # %bb.0:
424
+ ; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
425
+ ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
426
+ ; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
427
+ ; FAST-NEXT: retq
374
428
%res = shufflevector <16 x i32 > %v , <16 x i32 > undef , <8 x i32 > <i32 1 , i32 3 , i32 5 , i32 7 , i32 9 , i32 11 , i32 13 , i32 15 >
375
429
ret <8 x i32 > %res
376
430
}
377
431
378
432
;FIXME: can do better with vpcompress
379
433
define <4 x i32 > @test_v16i32_0_1_2_12 (<16 x i32 > %v ) {
380
- ; ALL-LABEL: test_v16i32_0_1_2_12:
381
- ; ALL: # %bb.0:
382
- ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
383
- ; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
384
- ; ALL-NEXT: vbroadcastss %xmm1, %xmm1
385
- ; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
386
- ; ALL-NEXT: vzeroupper
387
- ; ALL-NEXT: retq
434
+ ; SLOW-LABEL: test_v16i32_0_1_2_12:
435
+ ; SLOW: # %bb.0:
436
+ ; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
437
+ ; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
438
+ ; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
439
+ ; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
440
+ ; SLOW-NEXT: vzeroupper
441
+ ; SLOW-NEXT: retq
442
+ ;
443
+ ; FAST-LABEL: test_v16i32_0_1_2_12:
444
+ ; FAST: # %bb.0:
445
+ ; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12]
446
+ ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
447
+ ; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
448
+ ; FAST-NEXT: vzeroupper
449
+ ; FAST-NEXT: retq
388
450
%res = shufflevector <16 x i32 > %v , <16 x i32 > undef , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 12 >
389
451
ret <4 x i32 > %res
390
452
}
@@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
568
630
}
569
631
570
632
define <16 x float > @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04 (<8 x float > %a ) {
571
- ; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
572
- ; ALL: # %bb.0:
573
- ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
574
- ; ALL-NEXT: vbroadcastss %xmm0, %zmm0
575
- ; ALL-NEXT: retq
633
+ ; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
634
+ ; SLOW: # %bb.0:
635
+ ; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
636
+ ; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
637
+ ; SLOW-NEXT: retq
638
+ ;
639
+ ; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
640
+ ; FAST: # %bb.0:
641
+ ; FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
642
+ ; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
643
+ ; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
644
+ ; FAST-NEXT: retq
576
645
%shuffle = shufflevector <8 x float > %a , <8 x float > undef , <16 x i32 > <i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 , i32 4 >
577
646
ret <16 x float > %shuffle
578
647
}
0 commit comments