Commit 03cc50f

[X86] lowerShuffleAsSplitOrBlend - prefer splitting AVX1-only shuffles if the operands can be freely split (llvm#141055)
On an AVX1-only target, if both operands of a 256-bit shuffle can be freely split (concatenations or splats), then prefer to lower as 128-bit shuffles using splitAndLowerShuffle
1 parent c2892b0 commit 03cc50f

9 files changed (+216, -242 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 0 deletions
@@ -15252,6 +15252,20 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
                                 /*SimpleOnly*/ false);
 
+  // Without AVX2, if we can freely split the subvectors then we're better off
+  // performing half width shuffles.
+  if (!Subtarget.hasAVX2()) {
+    SDValue BC1 = peekThroughBitcasts(V1);
+    SDValue BC2 = peekThroughBitcasts(V2);
+    bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
+                          DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
+    bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
+                          DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
+    if (SplatOrSplitV1 && SplatOrSplitV2)
+      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                  /*SimpleOnly*/ false);
+  }
+
   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
   // requires that the decomposed single-input shuffles don't end up here.
   return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
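For illustration only, the heuristic added above can be sketched as a small standalone C++ program. This is not the SelectionDAG implementation; OperandKind and the helper names below are hypothetical stand-ins for the peekThroughBitcasts / isFreeToSplitVector / isSplatValue checks in the patch.

// Minimal sketch of the new AVX1-only split heuristic (assumptions noted above).
#include <cstdio>

// Hypothetical classification of a 256-bit shuffle operand.
enum class OperandKind { ConcatOf128BitSubvectors, Splat, Other };

// An operand is "free to split" when it is already a concatenation of 128-bit
// subvectors or a splat, so halving it costs no extra shuffles.
static bool isFreeToSplitOrSplat(OperandKind K) {
  return K == OperandKind::ConcatOf128BitSubvectors || K == OperandKind::Splat;
}

// Mirrors the new condition: on an AVX1-only target (no AVX2), prefer two
// half-width (128-bit) shuffles when both 256-bit operands split freely.
static bool preferHalfWidthShuffles(bool HasAVX2, OperandKind V1, OperandKind V2) {
  return !HasAVX2 && isFreeToSplitOrSplat(V1) && isFreeToSplitOrSplat(V2);
}

int main() {
  // AVX1-only, one concatenated operand and one splat operand: split.
  std::printf("%d\n", preferHalfWidthShuffles(
                          false, OperandKind::ConcatOf128BitSubvectors,
                          OperandKind::Splat)); // prints 1
  // With AVX2 available, the existing full-width lowering is kept.
  std::printf("%d\n", preferHalfWidthShuffles(
                          true, OperandKind::ConcatOf128BitSubvectors,
                          OperandKind::Splat)); // prints 0
  return 0;
}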

llvm/test/CodeGen/X86/matrix-multiply.ll

Lines changed: 5 additions & 9 deletions
@@ -281,20 +281,16 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
 ; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm3
 ; AVX1-NEXT: vmulps %xmm3, %xmm6, %xmm6
 ; AVX1-NEXT: vaddps %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,2],xmm0[0,1]
 ; AVX1-NEXT: vmulss %xmm2, %xmm9, %xmm2
-; AVX1-NEXT: vmulss %xmm5, %xmm10, %xmm5
-; AVX1-NEXT: vaddss %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vmulss %xmm5, %xmm10, %xmm4
+; AVX1-NEXT: vaddss %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vmulss %xmm3, %xmm8, %xmm3
 ; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX1-NEXT: vmovss %xmm2, 32(%rdi)
-; AVX1-NEXT: vmovaps %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovaps %xmm0, 16(%rdi)
+; AVX1-NEXT: vmovaps %xmm1, (%rdi)
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_mul3x3_f32:

llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll

Lines changed: 11 additions & 10 deletions
@@ -242,18 +242,19 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovaps (%rdi), %xmm0
 ; AVX-NEXT: vmovaps (%rsi), %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3
-; AVX-NEXT: vmovsldup {{.*#+}} ymm3 = ymm3[0,0,2,2,4,4,6,6]
-; AVX-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,u,u,1,5,u,u,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7]
-; AVX-NEXT: vbroadcastsd (%rdx), %ymm3
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX-NEXT: vmovaps (%rdx), %xmm2
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[2,1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0],xmm4[3]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
 ; AVX-NEXT: vmovaps %xmm0, 32(%rcx)
-; AVX-NEXT: vmovaps %ymm2, (%rcx)
+; AVX-NEXT: vmovaps %ymm3, (%rcx)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;

llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll

Lines changed: 15 additions & 17 deletions
@@ -251,24 +251,22 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT: vmovaps (%rsi), %xmm1
 ; AVX-NEXT: vmovaps (%rdx), %xmm2
 ; AVX-NEXT: vmovaps (%rcx), %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,1,1,0,4,5,5,4]
-; AVX-NEXT: vmovddup {{.*#+}} ymm6 = ymm5[0,0,2,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,2,3,5,4,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7]
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1,0,3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[1],xmm1[1],zero,zero
+; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm3[0]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0,2]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; AVX-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm0[3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,0],xmm5[2,3]
+; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm3[2]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX-NEXT: vmovaps %ymm0, 32(%r8)
-; AVX-NEXT: vmovaps %ymm1, (%r8)
+; AVX-NEXT: vmovaps %ymm4, (%r8)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;

llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll

Lines changed: 31 additions & 37 deletions
@@ -43,20 +43,18 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
 ; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,0,2,u,5,7,u]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm0[1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,0]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
 ; AVX-NEXT: vmovlps %xmm1, 32(%r9)
 ; AVX-NEXT: vmovaps %ymm0, (%r9)
 ; AVX-NEXT: vzeroupper
@@ -340,32 +338,28 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT: vmovaps (%rsi), %xmm1
 ; AVX-NEXT: vmovaps (%rdx), %xmm2
 ; AVX-NEXT: vmovaps (%rcx), %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm6
-; AVX-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5]
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6],ymm5[7]
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3],ymm7[4],ymm4[5,6,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,u,2,u,u,u,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[1,u,u,u,6,u,u,u]
-; AVX-NEXT: vbroadcastss 8(%rcx), %ymm6
-; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3]
-; AVX-NEXT: vbroadcastss 12(%rsi), %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3]
+; AVX-NEXT: vmovaps (%r8), %xmm4
+; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm3[0]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0,2]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm0[1],xmm1[1],zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm2[1]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7]
+; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2],xmm6[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm1[2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2],zero
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX-NEXT: vmovaps %xmm1, 64(%r9)
-; AVX-NEXT: vmovaps %ymm4, (%r9)
 ; AVX-NEXT: vmovaps %ymm0, 32(%r9)
+; AVX-NEXT: vmovaps %ymm5, (%r9)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;

llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll

Lines changed: 36 additions & 43 deletions
@@ -45,21 +45,17 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
 ; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7]
-; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm0[1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
 ; AVX-NEXT: vmovaps %xmm1, 32(%rax)
 ; AVX-NEXT: vmovaps %ymm0, (%rax)
 ; AVX-NEXT: vzeroupper
@@ -363,39 +359,36 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX: # %bb.0:
 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vmovaps (%rsi), %xmm2
-; AVX-NEXT: vmovaps (%rdx), %xmm1
+; AVX-NEXT: vmovaps (%rsi), %xmm1
+; AVX-NEXT: vmovaps (%rdx), %xmm2
 ; AVX-NEXT: vmovaps (%rcx), %xmm3
 ; AVX-NEXT: vmovaps (%r8), %xmm4
 ; AVX-NEXT: vmovaps (%r9), %xmm5
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm7
+; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm7
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm9
-; AVX-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
-; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
-; AVX-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm11
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
-; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6],ymm5[7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,7,5]
-; AVX-NEXT: vmovaps %ymm0, 64(%rax)
-; AVX-NEXT: vmovaps %ymm4, 32(%rax)
-; AVX-NEXT: vmovaps %ymm10, (%rax)
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,3],ymm7[3,3],ymm8[7,7],ymm7[7,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6],ymm6[7]
+; AVX-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,7,5]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm3[0],xmm2[0]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,0]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm3[2]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[1],xmm3[1],zero,zero
+; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT: vmovaps %ymm0, 32(%rax)
+; AVX-NEXT: vmovaps %ymm7, (%rax)
+; AVX-NEXT: vmovaps %ymm6, 64(%rax)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
