Skip to content

Commit 6e86b7e

Browse files
authored
[AMDGPU] Do not replace SALU floating point multiply with VALU-only ldexp (#145048)
1 parent 7897191 commit 6e86b7e

File tree

2 files changed

+62
-36
lines changed

2 files changed

+62
-36
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15189,6 +15189,12 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
1518915189
EVT ScalarVT = VT.getScalarType();
1519015190
EVT IntVT = VT.changeElementType(MVT::i32);
1519115191

15192+
if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15193+
(ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15194+
// Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
15195+
return SDValue();
15196+
}
15197+
1519215198
SDValue LHS = N->getOperand(0);
1519315199
SDValue RHS = N->getOperand(1);
1519415200

llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll

Lines changed: 56 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -2,19 +2,34 @@
22
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
33
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
44

5+
; TODO: GlobalISel should avoid generating v_ldexp_f32.
56
define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
6-
; GFX12-LABEL: v_s_exp_f32:
7-
; GFX12: ; %bb.0:
8-
; GFX12-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
9-
; GFX12-NEXT: s_cselect_b32 s1, 0x42800000, 0
10-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
11-
; GFX12-NEXT: s_add_f32 s0, s0, s1
12-
; GFX12-NEXT: s_cselect_b32 s1, 0xffffffc0, 0
13-
; GFX12-NEXT: v_s_exp_f32 s0, s0
14-
; GFX12-NEXT: s_wait_alu 0xf1ff
15-
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
16-
; GFX12-NEXT: v_ldexp_f32 v0, s0, s1
17-
; GFX12-NEXT: ; return to shader part epilog
7+
; GFX12-SDAG-LABEL: v_s_exp_f32:
8+
; GFX12-SDAG: ; %bb.0:
9+
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
10+
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42800000, 0
11+
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
12+
; GFX12-SDAG-NEXT: s_add_f32 s0, s0, s1
13+
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
14+
; GFX12-SDAG-NEXT: v_s_exp_f32 s0, s0
15+
; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
16+
; GFX12-SDAG-NEXT: s_mul_f32 s0, s0, s1
17+
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
18+
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
19+
; GFX12-SDAG-NEXT: ; return to shader part epilog
20+
;
21+
; GFX12-GISEL-LABEL: v_s_exp_f32:
22+
; GFX12-GISEL: ; %bb.0:
23+
; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
24+
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42800000, 0
25+
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
26+
; GFX12-GISEL-NEXT: s_add_f32 s0, s0, s1
27+
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0xffffffc0, 0
28+
; GFX12-GISEL-NEXT: v_s_exp_f32 s0, s0
29+
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
30+
; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
31+
; GFX12-GISEL-NEXT: v_ldexp_f32 v0, s0, s1
32+
; GFX12-GISEL-NEXT: ; return to shader part epilog
1833
%result = call float @llvm.exp2.f32(float %src)
1934
ret float %result
2035
}
@@ -59,14 +74,15 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
5974
; GFX12-SDAG-LABEL: v_s_log_f32:
6075
; GFX12-SDAG: ; %bb.0:
6176
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0x800000
62-
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0
63-
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
64-
; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, s1
65-
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
66-
; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0
77+
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
78+
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
79+
; GFX12-SDAG-NEXT: s_mul_f32 s0, s0, s1
80+
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
81+
; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
82+
; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
83+
; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
6784
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
68-
; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
69-
; GFX12-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
85+
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
7086
; GFX12-SDAG-NEXT: ; return to shader part epilog
7187
;
7288
; GFX12-GISEL-LABEL: v_s_log_f32:
@@ -147,7 +163,7 @@ define amdgpu_cs half @v_s_rcp_f16(half inreg %src) {
147163
ret half %result
148164
}
149165

150-
; TODO-GFX12: GlobalISel should generate v_s_rsq.
166+
; TODO: GlobalISel should generate v_s_rsq.
151167
define amdgpu_cs float @v_s_rsq_f32(float inreg %src) {
152168
; GFX12-SDAG-LABEL: v_s_rsq_f32:
153169
; GFX12-SDAG: ; %bb.0:
@@ -184,7 +200,7 @@ define amdgpu_cs half @v_s_rsq_f16(half inreg %src) {
184200
ret half %result
185201
}
186202

187-
; TODO-GFX12: Should not use any VALU instructions.
203+
; TODO: Should avoid generating v_cmp_class_f32.
188204
define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
189205
; GFX12-SDAG-LABEL: v_s_sqrt_f32:
190206
; GFX12-SDAG: ; %bb.0:
@@ -298,16 +314,18 @@ define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src) {
298314
define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
299315
; GFX12-SDAG-LABEL: srcmods_abs_f32:
300316
; GFX12-SDAG: ; %bb.0:
301-
; GFX12-SDAG-NEXT: s_and_b32 s1, s0, 0x7fffffff
317+
; GFX12-SDAG-NEXT: s_bitset0_b32 s0, 31
302318
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
303-
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s1, 0x800000
304-
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0
305-
; GFX12-SDAG-NEXT: v_ldexp_f32 v0, |s0|, s1
306-
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
307-
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
308-
; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0
319+
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0x800000
320+
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
321+
; GFX12-SDAG-NEXT: s_mul_f32 s0, s0, s1
322+
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
323+
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
324+
; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
325+
; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
309326
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
310-
; GFX12-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
327+
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
328+
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
311329
; GFX12-SDAG-NEXT: ; return to shader part epilog
312330
;
313331
; GFX12-GISEL-LABEL: srcmods_abs_f32:
@@ -333,15 +351,17 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
333351
define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
334352
; GFX12-SDAG-LABEL: srcmods_neg_f32:
335353
; GFX12-SDAG: ; %bb.0:
354+
; GFX12-SDAG-NEXT: s_xor_b32 s1, s0, 0x80000000
336355
; GFX12-SDAG-NEXT: s_cmp_gt_f32 s0, 0x80800000
337-
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0
338-
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
339-
; GFX12-SDAG-NEXT: v_ldexp_f32 v0, -s0, s1
340-
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
341-
; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0
356+
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
357+
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
358+
; GFX12-SDAG-NEXT: s_mul_f32 s0, s1, s0
359+
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
360+
; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
361+
; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
362+
; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
342363
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
343-
; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
344-
; GFX12-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0
364+
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
345365
; GFX12-SDAG-NEXT: ; return to shader part epilog
346366
;
347367
; GFX12-GISEL-LABEL: srcmods_neg_f32:

0 commit comments

Comments
 (0)