
Commit 30a01bc

[X86] Fold concat(pshufb(x,y),pshufb(z,w)) -> pshufb(concat(x,z),concat(y,w))
1 parent f67e3f6 commit 30a01bc
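
The fold is sound because PSHUFB shuffles bytes only within each 128-bit lane (the 256-bit form needs AVX2 and the 512-bit form needs AVX-512BW, which is what the Subtarget checks in the new case guard), so shuffling the two sources separately and then concatenating the results is the same as one wider per-lane shuffle of the concatenated sources with the concatenated masks. The following standalone C++ sketch demonstrates that identity on plain byte arrays; it is illustrative only and not LLVM code.

// Standalone illustration (not LLVM code): per-128-bit-lane byte shuffle,
// mirroring PSHUFB semantics. Bit 7 of a mask byte zeroes the destination
// byte; the low 4 bits index into the same 16-byte lane of the source.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

template <std::size_t N>
std::array<std::uint8_t, N> pshufb(const std::array<std::uint8_t, N> &Src,
                                   const std::array<std::uint8_t, N> &Mask) {
  std::array<std::uint8_t, N> R{};
  for (std::size_t I = 0; I != N; ++I) {
    std::size_t LaneBase = (I / 16) * 16;
    R[I] = (Mask[I] & 0x80) ? 0 : Src[LaneBase + (Mask[I] & 0x0F)];
  }
  return R;
}

// concat(Lo, Hi): Lo occupies the low half, Hi the high half.
template <std::size_t N>
std::array<std::uint8_t, 2 * N> concat(const std::array<std::uint8_t, N> &Lo,
                                       const std::array<std::uint8_t, N> &Hi) {
  std::array<std::uint8_t, 2 * N> R{};
  for (std::size_t I = 0; I != N; ++I) {
    R[I] = Lo[I];
    R[N + I] = Hi[I];
  }
  return R;
}

int main() {
  std::array<std::uint8_t, 16> X, Z, Y, W;
  for (std::size_t I = 0; I != 16; ++I) {
    X[I] = std::uint8_t(I);              // arbitrary source bytes
    Z[I] = std::uint8_t(100 + I);
    Y[I] = std::uint8_t(15 - I);         // in-lane reversal mask
    W[I] = std::uint8_t((I * 3) & 0x0F); // arbitrary in-lane indices
  }
  // Because every byte is selected from within its own 16-byte lane,
  // concat(pshufb(x,y), pshufb(z,w)) == pshufb(concat(x,z), concat(y,w)).
  assert(concat(pshufb(X, Y), pshufb(Z, W)) ==
         pshufb(concat(X, Z), concat(Y, W)));
  return 0;
}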

4 files changed: 127 additions, 78 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 25 additions & 0 deletions
@@ -53329,6 +53329,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                            DAG.getTargetConstant(Idx, DL, MVT::i8));
       }
       break;
+    case X86ISD::PSHUFB:
+      if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
+                       (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
+        return DAG.getNode(Op0.getOpcode(), DL, VT,
+                           ConcatSubOperand(VT, Ops, 0),
+                           ConcatSubOperand(VT, Ops, 1));
+      }
+      break;
     case X86ISD::VPERMV3:
       if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
         MVT OpVT = Op0.getSimpleValueType();
@@ -53464,6 +53472,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     }
   }
 
+  // Attempt to fold target constant loads.
+  if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
+    SmallVector<APInt> EltBits;
+    APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
+    for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
+      APInt OpUndefElts;
+      SmallVector<APInt> OpEltBits;
+      if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
+                                         OpEltBits, true, false))
+        break;
+      EltBits.append(OpEltBits);
+      UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
+    }
+    if (EltBits.size() == VT.getVectorNumElements())
+      return getConstVector(EltBits, UndefElts, VT, DAG, DL);
+  }
+
   return SDValue();
 }

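The second hunk above is a separate addition: when every concatenated operand is itself a target constant, the combine now materializes one wide constant directly instead of keeping the concat. The bookkeeping is just appending each operand's element bits and shifting its undef-element mask to that operand's element offset. Below is a rough standalone sketch of that idea; std::vector and a 64-bit bitmask stand in for LLVM's SmallVector<APInt> and APInt, all names are illustrative, and it assumes at most 64 elements.

// Standalone illustration (not LLVM code) of concatenating constant operands.
#include <cstdint>
#include <vector>

struct OperandConstant {
  std::vector<std::uint64_t> EltBits; // per-element constant values
  std::uint64_t UndefElts;            // bit I set => element I is undef
};

struct WideConstant {
  std::vector<std::uint64_t> EltBits;
  std::uint64_t UndefElts = 0;
};

// Append each operand's element bits and OR its undef mask shifted to the
// operand's element offset; this mirrors the shape of the new code path.
WideConstant concatConstants(const std::vector<OperandConstant> &Ops) {
  WideConstant Wide;
  unsigned EltOffset = 0;
  for (const OperandConstant &Op : Ops) {
    Wide.EltBits.insert(Wide.EltBits.end(), Op.EltBits.begin(),
                        Op.EltBits.end());
    Wide.UndefElts |= Op.UndefElts << EltOffset;
    EltOffset += Op.EltBits.size();
  }
  return Wide;
}

int main() {
  // Two 4-element constants; element 1 of the second operand is undef.
  OperandConstant Lo{{1, 2, 3, 4}, 0b0000};
  OperandConstant Hi{{5, 0, 7, 8}, 0b0010};
  WideConstant Wide = concatConstants({Lo, Hi});
  // Expect Wide.EltBits == {1,2,3,4,5,0,7,8} and UndefElts == 0b00100000.
  return (Wide.EltBits.size() == 8 && Wide.UndefElts == 0b00100000) ? 0 : 1;
}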
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll

Lines changed: 21 additions & 19 deletions
@@ -314,21 +314,23 @@ define void @store_i8_stride3_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX512-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX512-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %in.vec0 = load <16 x i8>, <16 x i8>* %in.vecptr0, align 32
 %in.vec1 = load <16 x i8>, <16 x i8>* %in.vecptr1, align 32
@@ -544,13 +546,13 @@ define void @store_i8_stride3_vf32(<32 x i8>* %in.vecptr0, <32 x i8>* %in.vecptr
 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT: vpshufb %zmm2, %zmm1, %zmm1
 ; AVX512-NEXT: vmovdqa %ymm0, 64(%rcx)
 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rcx)
 ; AVX512-NEXT: vzeroupper

llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll

Lines changed: 24 additions & 24 deletions
@@ -1493,13 +1493,13 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
 ; AVX512F-LABEL: PR54562_ref:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
@@ -1513,13 +1513,13 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
 ; AVX512DQ-LABEL: PR54562_ref:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT: retq
 ;
@@ -1538,13 +1538,13 @@ define void @PR54562_mem(<64 x i8>* %src, <64 x i8>* %dst) {
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm0
 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
+; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
-; AVX512F-NEXT: vmovdqa %xmm0, 48(%rsi)
-; AVX512F-NEXT: vmovdqa %xmm1, 32(%rsi)
-; AVX512F-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, (%rsi)
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rsi)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -1561,13 +1561,13 @@ define void @PR54562_mem(<64 x i8>* %src, <64 x i8>* %dst) {
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
-; AVX512DQ-NEXT: vmovdqa %xmm0, 48(%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rsi)
-; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rsi)
+; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rsi)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;

llvm/test/CodeGen/X86/x86-interleaved-access.ll

Lines changed: 57 additions & 35 deletions
@@ -972,24 +972,47 @@ ret void
 }
 
 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
-; AVX-LABEL: interleaved_store_vf16_i8_stride3:
-; AVX: # %bb.0:
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vmovdqu %xmm0, 16(%rdi)
-; AVX-NEXT: vmovdqu %xmm1, (%rdi)
-; AVX-NEXT: vmovdqu %xmm2, 32(%rdi)
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1OR2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1OR2-NEXT: vmovdqu %xmm0, 16(%rdi)
+; AVX1OR2-NEXT: vmovdqu %xmm1, (%rdi)
+; AVX1OR2-NEXT: vmovdqu %xmm2, 32(%rdi)
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT: vmovdqu %xmm0, 32(%rdi)
+; AVX512-NEXT: vmovdqu %ymm1, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
@@ -1069,13 +1092,13 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x
 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT: vpshufb %zmm2, %zmm1, %zmm1
 ; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
 ; AVX512-NEXT: vzeroupper
@@ -1209,26 +1232,25 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52]
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm5
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3]
 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm7
-; AVX512-NEXT: vpshufb %ymm4, %ymm7, %ymm7
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6
 ; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4
+; AVX512-NEXT: vpshufb %zmm4, %zmm2, %zmm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512-NEXT: vpshufb %zmm4, %zmm5, %zmm4
 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi)
-; AVX512-NEXT: vmovdqu64 %zmm3, 64(%rdi)
+; AVX512-NEXT: vmovdqu64 %zmm4, 64(%rdi)
 ; AVX512-NEXT: vmovdqu64 %zmm2, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
