@@ -842,3 +842,110 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_
842
842
%5 = shufflevector <64 x i8 > %3 , <64 x i8 > %4 , <64 x i32 > <i32 0 , i32 2 , i32 4 , i32 6 , i32 8 , i32 10 , i32 12 , i32 14 , i32 64 , i32 66 , i32 68 , i32 70 , i32 72 , i32 74 , i32 76 , i32 78 , i32 16 , i32 18 , i32 20 , i32 22 , i32 24 , i32 26 , i32 28 , i32 30 , i32 80 , i32 82 , i32 84 , i32 86 , i32 88 , i32 90 , i32 92 , i32 94 , i32 32 , i32 34 , i32 36 , i32 38 , i32 40 , i32 42 , i32 44 , i32 46 , i32 96 , i32 98 , i32 100 , i32 102 , i32 104 , i32 106 , i32 108 , i32 110 , i32 48 , i32 50 , i32 52 , i32 54 , i32 56 , i32 58 , i32 60 , i32 62 , i32 112 , i32 114 , i32 116 , i32 118 , i32 120 , i32 122 , i32 124 , i32 126 >
843
843
ret <64 x i8 > %5
844
844
}
845
+
846
; Regression test for PR54562: splitting a <64 x i8> two-stage shuffle
; (12-byte groups expanded to 16-byte lanes, then a per-lane byte rotate
; pattern) must not lose the cross-lane offsets. AVX512F/BW/DQ lower via
; vpermq + per-lane vpshufb + extract/palignr/insert; AVX512VBMI folds the
; whole thing into a single vpermb.
define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
; AVX512F-LABEL: PR54562_ref:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: PR54562_ref:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: PR54562_ref:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: PR54562_ref:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,13,12,14,13,16,15,17,16,19,18,20,19,22,21,23,22,25,24,26,25,28,27,29,28,31,30,32,31,34,33,35,34,37,36,38,37,40,39,41,40,43,42,44,43,46,45,47,46]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    retq
  ; Stage 1: expand each 12-byte source group into a 16-byte lane (tail of
  ; each lane is undef).
  %shuffle1 = shufflevector <64 x i8> %a0, <64 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Stage 2: per-lane byte rearrangement over the expanded lanes.
  %shuffle2 = shufflevector <64 x i8> %shuffle1, <64 x i8> poison, <64 x i32> <i32 1, i32 0, i32 2, i32 1, i32 4, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 7, i32 10, i32 9, i32 11, i32 10, i32 17, i32 16, i32 18, i32 17, i32 20, i32 19, i32 21, i32 20, i32 23, i32 22, i32 24, i32 23, i32 26, i32 25, i32 27, i32 26, i32 33, i32 32, i32 34, i32 33, i32 36, i32 35, i32 37, i32 36, i32 39, i32 38, i32 40, i32 39, i32 42, i32 41, i32 43, i32 42, i32 49, i32 48, i32 50, i32 49, i32 52, i32 51, i32 53, i32 52, i32 55, i32 54, i32 56, i32 55, i32 58, i32 57, i32 59, i32 58>
  ret <64 x i8> %shuffle2
}
895
+
896
; Memory variant of PR54562: same two-stage shuffle, but loaded from %src and
; stored to %dst. AVX512F/DQ (no 512-bit byte shuffles) store the result in
; three pieces; AVX512BW reassembles a zmm; AVX512VBMI uses a single
; load-folded vpermb.
define void @PR54562_mem(<64 x i8>* %src, <64 x i8>* %dst) {
; AVX512F-LABEL: PR54562_mem:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm0
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512F-NEXT:    vmovdqa %xmm0, 48(%rsi)
; AVX512F-NEXT:    vmovdqa %xmm1, 32(%rsi)
; AVX512F-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: PR54562_mem:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = mem[0,1,1,2]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm1
; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: PR54562_mem:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512DQ-NEXT:    vmovdqa %xmm0, 48(%rsi)
; AVX512DQ-NEXT:    vmovdqa %xmm1, 32(%rsi)
; AVX512DQ-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512VBMI-LABEL: PR54562_mem:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,13,12,14,13,16,15,17,16,19,18,20,19,22,21,23,22,25,24,26,25,28,27,29,28,31,30,32,31,34,33,35,34,37,36,38,37,40,39,41,40,43,42,44,43,46,45,47,46]
; AVX512VBMI-NEXT:    vpermb (%rdi), %zmm0, %zmm0
; AVX512VBMI-NEXT:    vmovdqa64 %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
  %load = load <64 x i8>, <64 x i8>* %src, align 512
  ; Stage 1: expand each 12-byte source group into a 16-byte lane (tail of
  ; each lane is undef).
  %shuffle1 = shufflevector <64 x i8> %load, <64 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Stage 2: per-lane byte rearrangement over the expanded lanes.
  %shuffle2 = shufflevector <64 x i8> %shuffle1, <64 x i8> poison, <64 x i32> <i32 1, i32 0, i32 2, i32 1, i32 4, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 7, i32 10, i32 9, i32 11, i32 10, i32 17, i32 16, i32 18, i32 17, i32 20, i32 19, i32 21, i32 20, i32 23, i32 22, i32 24, i32 23, i32 26, i32 25, i32 27, i32 26, i32 33, i32 32, i32 34, i32 33, i32 36, i32 35, i32 37, i32 36, i32 39, i32 38, i32 40, i32 39, i32 42, i32 41, i32 43, i32 42, i32 49, i32 48, i32 50, i32 49, i32 52, i32 51, i32 53, i32 52, i32 55, i32 54, i32 56, i32 55, i32 58, i32 57, i32 59, i32 58>
  store <64 x i8> %shuffle2, <64 x i8>* %dst, align 512
  ret void
}
0 commit comments