@@ -972,24 +972,47 @@ ret void
}

define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
- ; AVX-LABEL: interleaved_store_vf16_i8_stride3:
- ; AVX: # %bb.0:
- ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
- ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
- ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
- ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
- ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
- ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
- ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
- ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
- ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
- ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
- ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
- ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
- ; AVX-NEXT: vmovdqu %xmm0, 16(%rdi)
- ; AVX-NEXT: vmovdqu %xmm1, (%rdi)
- ; AVX-NEXT: vmovdqu %xmm2, 32(%rdi)
- ; AVX-NEXT: retq
+ ; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride3:
+ ; AVX1OR2: # %bb.0:
+ ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+ ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+ ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+ ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+ ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+ ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+ ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+ ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+ ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+ ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+ ; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+ ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+ ; AVX1OR2-NEXT: vmovdqu %xmm0, 16(%rdi)
+ ; AVX1OR2-NEXT: vmovdqu %xmm1, (%rdi)
+ ; AVX1OR2-NEXT: vmovdqu %xmm2, 32(%rdi)
+ ; AVX1OR2-NEXT: retq
+ ;
+ ; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
+ ; AVX512: # %bb.0:
+ ; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+ ; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+ ; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+ ; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+ ; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+ ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+ ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+ ; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
+ ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+ ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
+ ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+ ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+ ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+ ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+ ; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+ ; AVX512-NEXT: vmovdqu %xmm0, 32(%rdi)
+ ; AVX512-NEXT: vmovdqu %ymm1, (%rdi)
+ ; AVX512-NEXT: vzeroupper
+ ; AVX512-NEXT: retq
%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
@@ -1069,13 +1092,13 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x
; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
- ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
- ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
- ; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
- ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+ ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+ ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+ ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+ ; AVX512-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512-NEXT: vzeroupper
@@ -1209,26 +1232,25 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
- ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
- ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
- ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
- ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm5
- ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3]
- ; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm6
+ ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+ ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3]
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
- ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm7
- ; AVX512-NEXT: vpshufb %ymm4, %ymm7, %ymm7
+ ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6
; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
- ; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
- ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
- ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm2
- ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+ ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2
+ ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+ ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+ ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4
+ ; AVX512-NEXT: vpshufb %zmm4, %zmm2, %zmm2
+ ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+ ; AVX512-NEXT: vpshufb %zmm4, %zmm5, %zmm4
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+ ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi)
- ; AVX512-NEXT: vmovdqu64 %zmm3, 64(%rdi)
+ ; AVX512-NEXT: vmovdqu64 %zmm4, 64(%rdi)
; AVX512-NEXT: vmovdqu64 %zmm2, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq