Skip to content

Commit f42de69

Browse files
committed
[X86] vector-shuffle-512-v16.ll - add fast shuffle test coverage
1 parent 9c42ed1 commit f42de69

File tree

1 file changed

+125
-56
lines changed

1 file changed

+125
-56
lines changed

llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll

Lines changed: 125 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512F
3-
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512BW
2+
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F
3+
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW
4+
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F
5+
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW
46

57
target triple = "x86_64-unknown-unknown"
68

@@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
1416
}
1517

1618
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
17-
; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
18-
; ALL: # %bb.0:
19-
; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
20-
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
21-
; ALL-NEXT: retq
19+
; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
20+
; SLOW: # %bb.0:
21+
; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
22+
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
23+
; SLOW-NEXT: retq
24+
;
25+
; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
26+
; FAST: # %bb.0:
27+
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
28+
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
29+
; FAST-NEXT: retq
2230
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
2331
ret <16 x float> %shuffle
2432
}
2533

2634
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
27-
; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
28-
; ALL: # %bb.0:
29-
; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
30-
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
31-
; ALL-NEXT: retq
35+
; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
36+
; SLOW: # %bb.0:
37+
; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
38+
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
39+
; SLOW-NEXT: retq
40+
;
41+
; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
42+
; FAST: # %bb.0:
43+
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
44+
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
45+
; FAST-NEXT: retq
3246
%tmp0 = bitcast <16 x i32> %a to <16 x float>
3347
%tmp1 = bitcast <16 x i32> %b to <16 x float>
3448
%shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1
196210

197211
; PR86076
198212
define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08(float %a0, float %a1) {
199-
; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
200-
; ALL: # %bb.0:
201-
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
202-
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
203-
; ALL-NEXT: retq
213+
; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
214+
; SLOW: # %bb.0:
215+
; SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
216+
; SLOW-NEXT: vbroadcastsd %xmm0, %zmm0
217+
; SLOW-NEXT: retq
218+
;
219+
; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
220+
; FAST: # %bb.0:
221+
; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
222+
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
223+
; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
224+
; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
225+
; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
226+
; FAST-NEXT: retq
204227
%v0 = insertelement <8 x float> poison, float %a0, i64 0
205228
%v1 = insertelement <8 x float> poison, float %a1, i64 0
206229
%sv = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
@@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
217240
}
218241

219242
define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
220-
; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
221-
; ALL: # %bb.0:
222-
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
223-
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
224-
; ALL-NEXT: retq
243+
; SLOW-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
244+
; SLOW: # %bb.0:
245+
; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
246+
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
247+
; SLOW-NEXT: retq
248+
;
249+
; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
250+
; FAST: # %bb.0:
251+
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
252+
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
253+
; FAST-NEXT: retq
225254
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
226255
ret <16 x i32> %shuffle
227256
}
@@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08
302331

303332
; PR46249
304333
define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x i32> %a) {
305-
; ALL-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
306-
; ALL: # %bb.0:
307-
; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
308-
; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
309-
; ALL-NEXT: retq
334+
; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
335+
; SLOW: # %bb.0:
336+
; SLOW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
337+
; SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
338+
; SLOW-NEXT: retq
339+
;
340+
; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
341+
; FAST: # %bb.0:
342+
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
343+
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
344+
; FAST-NEXT: retq
310345
%1 = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
311346
ret <16 x i32> %1
312347
}
313348

314349
define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) {
315-
; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
316-
; ALL: # %bb.0:
317-
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
318-
; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
319-
; ALL-NEXT: retq
350+
; SLOW-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
351+
; SLOW: # %bb.0:
352+
; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
353+
; SLOW-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
354+
; SLOW-NEXT: retq
355+
;
356+
; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
357+
; FAST: # %bb.0:
358+
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
359+
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
360+
; FAST-NEXT: retq
320361
%1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
321362
ret <16 x float> %1
322363
}
@@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
333374
}
334375

335376
define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, ptr %a1) {
336-
; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
337-
; ALL: # %bb.0:
338-
; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
339-
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
340-
; ALL-NEXT: retq
377+
; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
378+
; SLOW: # %bb.0:
379+
; SLOW-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
380+
; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
381+
; SLOW-NEXT: retq
382+
;
383+
; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
384+
; FAST: # %bb.0:
385+
; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
386+
; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
387+
; FAST-NEXT: retq
341388
%1 = load <16 x float>, ptr %a1
342389
%2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>
343390
ret <16 x float> %2
@@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
365412

366413
; FIXME: can do better with vpcompress
367414
define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
368-
; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
369-
; ALL: # %bb.0:
370-
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
371-
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
372-
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
373-
; ALL-NEXT: retq
415+
; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
416+
; SLOW: # %bb.0:
417+
; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
418+
; SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
419+
; SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
420+
; SLOW-NEXT: retq
421+
;
422+
; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
423+
; FAST: # %bb.0:
424+
; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
425+
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
426+
; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
427+
; FAST-NEXT: retq
374428
%res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
375429
ret <8 x i32> %res
376430
}
377431

378432
; FIXME: can do better with vpcompress
379433
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
380-
; ALL-LABEL: test_v16i32_0_1_2_12:
381-
; ALL: # %bb.0:
382-
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
383-
; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
384-
; ALL-NEXT: vbroadcastss %xmm1, %xmm1
385-
; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
386-
; ALL-NEXT: vzeroupper
387-
; ALL-NEXT: retq
434+
; SLOW-LABEL: test_v16i32_0_1_2_12:
435+
; SLOW: # %bb.0:
436+
; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
437+
; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
438+
; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
439+
; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
440+
; SLOW-NEXT: vzeroupper
441+
; SLOW-NEXT: retq
442+
;
443+
; FAST-LABEL: test_v16i32_0_1_2_12:
444+
; FAST: # %bb.0:
445+
; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12]
446+
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
447+
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
448+
; FAST-NEXT: vzeroupper
449+
; FAST-NEXT: retq
388450
%res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
389451
ret <4 x i32> %res
390452
}
@@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
568630
}
569631

570632
define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
571-
; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
572-
; ALL: # %bb.0:
573-
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
574-
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
575-
; ALL-NEXT: retq
633+
; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
634+
; SLOW: # %bb.0:
635+
; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
636+
; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
637+
; SLOW-NEXT: retq
638+
;
639+
; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
640+
; FAST: # %bb.0:
641+
; FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
642+
; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
643+
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
644+
; FAST-NEXT: retq
576645
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
577646
ret <16 x float> %shuffle
578647
}

0 commit comments

Comments
 (0)