Skip to content

Commit 217304a

Browse files
[X86] Use X86FixupInstTunings to select between (V)MOVSS/D and (V)BLENDPS/D (#143895)
Fixes #142588. Following @RKSimon's suggestion, the transformation applies only when the blend mask is exactly 1, indicating that the instruction behaves like a move. Additionally, the conversion is only performed when optimizing for size or when the target prefers MOVSS/D over BLENDPS/D for performance reasons. The switch-case instructions were identified with GPT [model name unclear in original]. Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
1 parent 1a4cf1d commit 217304a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+272
-229
lines changed

llvm/lib/Target/X86/X86FixupInstTuning.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,22 @@ bool X86FixupInstTuningPass::processInstruction(
222222
return ProcessUNPCKToIntDomain(NewOpc);
223223
};
224224

225+
auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
226+
if (MI.getOperand(NumOperands - 1).getImm() != 1)
227+
return false;
228+
bool Force = MF.getFunction().hasOptSize();
229+
if (!Force && !NewOpcPreferable(MovOpc))
230+
return false;
231+
MI.setDesc(TII->get(MovOpc));
232+
MI.removeOperand(NumOperands - 1);
233+
return true;
234+
};
235+
225236
switch (Opc) {
237+
case X86::VBLENDPSrri:
238+
return ProcessBLENDToMOV(X86::VMOVSSrr);
239+
case X86::VBLENDPDrri:
240+
return ProcessBLENDToMOV(X86::VMOVSDrr);
226241
case X86::VPERMILPDri:
227242
return ProcessVPERMILPDri(X86::VSHUFPDrri);
228243
case X86::VPERMILPDYri:

llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ define void @endless_loop() {
1111
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
1212
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
1313
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
14-
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
14+
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
1515
; AVX1-NEXT: vmovaps %ymm0, (%eax)
1616
; AVX1-NEXT: vmovaps %ymm1, (%eax)
1717
; AVX1-NEXT: vzeroupper
@@ -21,7 +21,7 @@ define void @endless_loop() {
2121
; AVX2: # %bb.0: # %entry
2222
; AVX2-NEXT: vbroadcastss (%eax), %xmm0
2323
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
24-
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
24+
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2525
; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
2626
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
2727
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]

llvm/test/CodeGen/X86/avx-insertelt.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float
9494
; AVX-LABEL: insert_f32_firstelt_of_high_subvector:
9595
; AVX: # %bb.0:
9696
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
97-
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
97+
; AVX-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
9898
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
9999
; AVX-NEXT: retq
100100
;
@@ -202,9 +202,9 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) {
202202
define <8 x float> @insert_f32_firstelts(<8 x float> %x, float %s) {
203203
; AVX-LABEL: insert_f32_firstelts:
204204
; AVX: # %bb.0:
205-
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
205+
; AVX-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
206206
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
207-
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
207+
; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
208208
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
209209
; AVX-NEXT: retq
210210
;

llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1843,7 +1843,7 @@ define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
18431843
; X86-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
18441844
; X86-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
18451845
; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
1846-
; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1846+
; X86-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
18471847
; X86-NEXT: retl
18481848
;
18491849
; X64-LABEL: test_mm_cvtu64_sd:
@@ -1891,7 +1891,7 @@ define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
18911891
; X86-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
18921892
; X86-NEXT: fstps {{[0-9]+}}(%esp)
18931893
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1894-
; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1894+
; X86-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
18951895
; X86-NEXT: movl %ebp, %esp
18961896
; X86-NEXT: popl %ebp
18971897
; X86-NEXT: .cfi_def_cfa %esp, 4

llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10483,7 +10483,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
1048310483
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
1048410484
; CHECK: ## %bb.0:
1048510485
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
10486-
; CHECK-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
10486+
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
1048710487
; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
1048810488
; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
1048910489
%q = load float, ptr %ptr_b

llvm/test/CodeGen/X86/avx512-intrinsics.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6505,7 +6505,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
65056505
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
65066506
; CHECK: # %bb.0:
65076507
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
6508-
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
6508+
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
65096509
; CHECK-NEXT: ret{{[l|q]}}
65106510
%q = load float, ptr %ptr_b
65116511
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0

llvm/test/CodeGen/X86/avx512copy-intrinsics.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
1111
; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
1212
; NOAVX512MOVZXC: # %bb.0:
1313
; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
14-
; NOAVX512MOVZXC-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
14+
; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
1515
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
1616
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
1717
%res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>

llvm/test/CodeGen/X86/build-vector-512.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) {
578578
; AVX-32-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,17,0,0]
579579
; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
580580
; AVX-32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
581-
; AVX-32-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
581+
; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
582582
; AVX-32-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
583583
; AVX-32-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm2[0]
584584
; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
@@ -626,7 +626,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
626626
; AVX-32-NEXT: vbroadcastss (%ecx), %xmm1
627627
; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
628628
; AVX-32-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
629-
; AVX-32-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
629+
; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
630630
; AVX-32-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
631631
; AVX-32-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
632632
; AVX-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
@@ -640,7 +640,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
640640
; AVX-64-NEXT: vbroadcastss (%rdi), %xmm1
641641
; AVX-64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
642642
; AVX-64-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
643-
; AVX-64-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
643+
; AVX-64-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
644644
; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
645645
; AVX-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
646646
; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]

llvm/test/CodeGen/X86/buildvec-extract.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
4242
; AVX-LABEL: extract0_i32_zext_insert0_i64_zero:
4343
; AVX: # %bb.0:
4444
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
45-
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
45+
; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
4646
; AVX-NEXT: retq
4747
%e = extractelement <4 x i32> %x, i32 0
4848
%z = zext i32 %e to i64
@@ -85,7 +85,7 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
8585
; AVX: # %bb.0:
8686
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
8787
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
88-
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
88+
; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
8989
; AVX-NEXT: retq
9090
%e = extractelement <4 x i32> %x, i32 1
9191
%z = zext i32 %e to i64
@@ -130,7 +130,7 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
130130
; AVX: # %bb.0:
131131
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
132132
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
133-
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
133+
; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
134134
; AVX-NEXT: retq
135135
%e = extractelement <4 x i32> %x, i32 2
136136
%z = zext i32 %e to i64

llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
5151
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
5252
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
5353
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
54-
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
54+
; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
5555
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
5656
; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi)
5757
; AVX512-NEXT: retq
@@ -149,7 +149,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
149149
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
150150
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
151151
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
152-
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
152+
; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
153153
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
154154
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
155155
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
@@ -235,12 +235,12 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
235235
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
236236
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm2
237237
; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3
238-
; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
238+
; AVX512-NEXT: vmovss {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
239239
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
240240
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
241241
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
242242
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
243-
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
243+
; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
244244
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
245245
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
246246
; AVX512-NEXT: vmovd %xmm0, (%rdi)

0 commit comments

Comments
 (0)