@@ -14447,7 +14447,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
- ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
@@ -14483,7 +14482,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm11
- ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+ ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
@@ -14516,7 +14515,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
- ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
+ ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
@@ -14530,8 +14529,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
- ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7]
- ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18
+ ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
+ ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
@@ -14738,7 +14737,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm27
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19
- ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
@@ -14747,7 +14745,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128>
; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
- ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm1
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7]
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
@@ -14823,14 +14820,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
- ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
- ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm13
- ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+ ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
+ ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13
+ ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5
- ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4
- ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
+ ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm4
+ ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm10
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]