
Commit 97ee923

[X86] lowerV64I8Shuffle - attempt to fold to SHUFFLE(ALIGNR(X,Y)) and OR(PSHUFB(X),PSHUFB(Y))
1 parent 5af1ca8 commit 97ee923

4 files changed (+60, -104 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 0 deletions
@@ -18721,6 +18721,20 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                           Zeroable, Subtarget, DAG))
     return Blend;
 
+  if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
+    // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
+    // PALIGNR will be cheaper than the second PSHUFB+OR.
+    if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
+                                                       Mask, Subtarget, DAG))
+      return V;
+
+    // If we can't directly blend but can use PSHUFB, that will be better as it
+    // can both shuffle and set up the inefficient blend.
+    bool V1InUse, V2InUse;
+    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
+                                        DAG, V1InUse, V2InUse);
+  }
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (!V2.isUndef())
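
Illustrative aside (not part of the commit): the OR(PSHUFB(X),PSHUFB(Y)) fallback works because a PSHUFB control byte with bit 7 set (0x80) writes zero to that output byte, so each input can be shuffled with the other input's bytes masked to zero and the two results combined with a single OR. The standalone C++ sketch below derives the two control vectors for one 128-bit lane from an assumed example mask (indices >= 16 select the second input, mirroring LLVM's two-input shuffle masks); it is a rough model of the idea, not the lowerShuffleAsBlendOfPSHUFBs implementation.

// Hedged sketch: derive per-input PSHUFB control bytes for one 128-bit lane
// of a two-input byte shuffle. result = PSHUFB(V1, V1Ctl) | PSHUFB(V2, V2Ctl).
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example mask just to show the derivation; the real lowering path
  // only fires after simpler blends/rotates have been tried and rejected.
  std::array<int, 16> Mask;
  for (int i = 0; i < 16; ++i)
    Mask[i] = (i % 2 == 0) ? (15 - i) : (16 + i); // even bytes from V1, odd from V2

  std::array<std::uint8_t, 16> V1Ctl, V2Ctl;
  for (int i = 0; i < 16; ++i) {
    V1Ctl[i] = Mask[i] < 16 ? std::uint8_t(Mask[i]) : 0x80;       // 0x80 -> PSHUFB writes zero
    V2Ctl[i] = Mask[i] >= 16 ? std::uint8_t(Mask[i] - 16) : 0x80;
  }
  for (int i = 0; i < 16; ++i)
    std::printf("byte %2d: V1Ctl=0x%02x V2Ctl=0x%02x\n", i,
                (unsigned)V1Ctl[i], (unsigned)V2Ctl[i]);
  return 0;
}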

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll

Lines changed: 22 additions & 48 deletions
@@ -976,54 +976,28 @@ define void @load_i8_stride4_vf32(<128 x i8>* %in.vec, <32 x i8>* %out.vec0, <32
 ;
 ; AVX512-LABEL: load_i8_stride4_vf32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX512-NEXT: vmovdqa 96(%rdi), %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7]
-; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm4
-; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,1,5,9,13]
-; AVX512-NEXT: vpermd %zmm4, %zmm5, %zmm4
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm7
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
-; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm6
-; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT: vpermd %zmm6, %zmm5, %zmm6
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7]
-; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm7
-; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512-NEXT: vpermd %zmm7, %zmm5, %zmm7
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7]
-; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm2
-; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpermd %zmm0, %zmm5, %zmm0
-; AVX512-NEXT: vmovdqa %ymm4, (%rsi)
-; AVX512-NEXT: vmovdqa %ymm6, (%rdx)
-; AVX512-NEXT: vmovdqa %ymm7, (%rcx)
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13]
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1
+; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm2
+; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zmm1[0,4,8,12],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,16,20,24,28],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,32,36,40,44],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,48,52,56,60],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT: vpermd %zmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zero,zero,zmm2[1,5,9,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[17,21,25,29,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[33,37,41,45,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[49,53,57,61,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zmm1[1,5,9,13],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,17,21,25,29],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,33,37,41,45],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,49,53,57,61],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm4, %zmm5, %zmm4
+; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm4
+; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zmm2[2,6,10,14,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[18,22,26,30,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[34,38,42,46,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[50,54,58,62,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm6 = zmm1[2,6,10,14],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,18,22,26,30],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,34,38,42,46],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,50,54,58,62],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5
+; AVX512-NEXT: vpermd %zmm5, %zmm0, %zmm5
+; AVX512-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[3,7,11,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[19,23,27,31,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[35,39,43,47,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[51,55,59,63,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[3,7,11,15],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,19,23,27,31],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,35,39,43,47],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,51,55,59,63],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa %ymm3, (%rsi)
+; AVX512-NEXT: vmovdqa %ymm4, (%rdx)
+; AVX512-NEXT: vmovdqa %ymm5, (%rcx)
 ; AVX512-NEXT: vmovdqa %ymm0, (%r8)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq

llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll

Lines changed: 3 additions & 9 deletions
@@ -562,16 +562,10 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zmm0[13],zero,zmm0[11],zero,zmm0[9],zero,zmm0[7],zero,zmm0[5],zero,zmm0[3],zero,zmm0[1],zero,zmm0[31],zero,zmm0[29],zero,zmm0[27],zero,zmm0[25],zero,zmm0[23],zero,zmm0[21],zero,zmm0[19],zero,zmm0[17],zero,zmm0[47],zero,zmm0[45],zero,zmm0[43],zero,zmm0[41],zero,zmm0[39],zero,zmm0[37],zero,zmm0[35],zero,zmm0[33],zero,zmm0[63],zero,zmm0[61],zero,zmm0[59],zero,zmm0[57],zero,zmm0[55],zero,zmm0[53],zero,zmm0[51],zero,zmm0[49],zero
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
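
Observation on the hunk above (an aside, not text from the commit): every odd result byte of this shuffle is the even byte of the same 16-bit word of the second operand, which is exactly what a per-word left shift by 8 produces (low byte moved into the high byte, low byte zeroed). That is why one PSHUFB of the new OR(PSHUFB,PSHUFB) pair folds into the cheaper vpsllw $8, leaving a single vpshufb plus vporq. A minimal scalar C++ check of the per-word identity:

// Assumed illustration: a 16-bit left shift by 8 moves the low byte into the
// high byte and zeroes the low byte - the per-word effect of VPSLLW $8 above.
#include <cstdint>
#include <cstdio>

int main() {
  std::uint16_t word = 0xBBAA;                      // low byte 0xAA, high byte 0xBB
  std::uint16_t shifted = std::uint16_t(word << 8); // 0xAA00
  std::printf("0x%04X << 8 = 0x%04X\n", (unsigned)word, (unsigned)shifted);
  return 0;
}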

llvm/test/CodeGen/X86/x86-interleaved-access.ll

Lines changed: 21 additions & 47 deletions
@@ -762,53 +762,27 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 ;
 ; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512-NEXT: vmovdqa 96(%rdi), %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7]
-; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm4
-; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,1,5,9,13]
-; AVX512-NEXT: vpermd %zmm4, %zmm5, %zmm4
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm7
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
-; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm6
-; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT: vpermd %zmm6, %zmm5, %zmm6
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7]
-; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7
-; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512-NEXT: vpermd %zmm7, %zmm5, %zmm7
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7]
-; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT: vpermd %zmm0, %zmm5, %zmm0
-; AVX512-NEXT: vpcmpeqb %zmm6, %zmm4, %k0
-; AVX512-NEXT: vpcmpeqb %zmm0, %zmm7, %k1
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
+; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zmm1[0,4,8,12],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,16,20,24,28],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,32,36,40,44],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,48,52,56,60],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT: vpermd %zmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zero,zero,zmm2[1,5,9,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[17,21,25,29,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[33,37,41,45,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[49,53,57,61,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zmm1[1,5,9,13],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,17,21,25,29],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,33,37,41,45],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,49,53,57,61],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm4, %zmm5, %zmm4
+; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm4
+; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zmm2[2,6,10,14,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[18,22,26,30,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[34,38,42,46,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[50,54,58,62,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm6 = zmm1[2,6,10,14],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,18,22,26,30],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,34,38,42,46],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,50,54,58,62],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5
+; AVX512-NEXT: vpermd %zmm5, %zmm0, %zmm5
+; AVX512-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[3,7,11,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[19,23,27,31,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[35,39,43,47,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[51,55,59,63,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[3,7,11,15],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,19,23,27,31],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,35,39,43,47],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,51,55,59,63],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpeqb %zmm4, %zmm3, %k0
+; AVX512-NEXT: vpcmpeqb %zmm0, %zmm5, %k1
 ; AVX512-NEXT: kxnord %k1, %k0, %k0
 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
