Skip to content

Commit f5891bf

Browse files
committed
[GlobalISel] fdiv to fmul transform
This is a port of the SDAG DAGCombiner::combineRepeatedFPDivisors combine, which looks for multiple fdiv operations with the same divisor and converts them to a single reciprocal fdiv and multiple fmuls. It is currently a fairly faithful port, with some additions to make sure that the newly created fdiv dominates all new uses. Compared to the SDAG version, it also drops some logic about splat uses, which assumes no vector fdivs, and some logic about x/sqrt(x), which does not yet apply to GISel.
1 parent 71f6bfe commit f5891bf

File tree

4 files changed

+155
-82
lines changed

4 files changed

+155
-82
lines changed

llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,10 @@ class CombinerHelper {
810810

811811
bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info) const;
812812

813+
/// Match a G_FDIV \p MI whose divisor is shared with other G_FDIVs in the
/// same basic block. On a successful match \p MatchInfo holds the collected
/// divisions, with an entry that dominates the others at index 0. Returns
/// true only when the number of collected divisions reaches the threshold
/// reported by TargetLowering::combineRepeatedFPDivisors().
bool matchRepeatedFPDivisor(MachineInstr &MI,
814+
SmallVector<MachineInstr *> &MatchInfo) const;
815+
/// Rewrite the divisions in \p MatchInfo as G_FMULs by one shared reciprocal
/// (1.0 / divisor) that is built in front of MatchInfo[0]; the original
/// G_FDIVs are erased.
void applyRepeatedFPDivisor(SmallVector<MachineInstr *> &MatchInfo) const;
816+
813817
/// Transform G_ADD(x, G_SUB(y, x)) to y.
814818
/// Transform G_ADD(G_SUB(y, x), x) to y.
815819
bool matchAddSubSameReg(MachineInstr &MI, Register &Src) const;

llvm/include/llvm/Target/GlobalISel/Combine.td

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ def build_fn_matchinfo :
212212
GIDefMatchData<"std::function<void(MachineIRBuilder &)>">;
213213
def unsigned_matchinfo: GIDefMatchData<"unsigned">;
214214
def register_vector_matchinfo : GIDefMatchData<"SmallVector<Register>">;
215+
def mi_vector_matchinfo : GIDefMatchData<"SmallVector<MachineInstr *>">;
215216

216217
def copy_prop : GICombineRule<
217218
(defs root:$d),
@@ -1333,6 +1334,14 @@ def combine_minmax_nan: GICombineRule<
13331334
[{ return Helper.matchCombineFMinMaxNaN(*${root}, ${info}); }]),
13341335
(apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${info}); }])>;
13351336

1337+
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
1338+
// reciprocal.
// The rule is rooted at a single G_FDIV. The C++ match step
// (Helper.matchRepeatedFPDivisor) fills $matchinfo with the instructions to
// rewrite, and the apply step (Helper.applyRepeatedFPDivisor) consumes that
// same vector.
1339+
def fdiv_repeated_divison: GICombineRule<
1340+
(defs root:$root, mi_vector_matchinfo:$matchinfo),
1341+
(match (G_FDIV $dst, $src1, $src2):$root,
1342+
[{ return Helper.matchRepeatedFPDivisor(*${root}, ${matchinfo}); }]),
1343+
(apply [{ Helper.applyRepeatedFPDivisor(${matchinfo}); }])>;
1344+
13361345
// Transform (add x, (sub y, x)) -> y
13371346
// Transform (add (sub y, x), x) -> y
13381347
def add_sub_reg_frags : GICombinePatFrag<
@@ -2056,7 +2065,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
20562065
constant_fold_cast_op, fabs_fneg_fold,
20572066
intdiv_combines, mulh_combines, redundant_neg_operands,
20582067
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
2059-
intrem_combines, sub_add_reg, select_to_minmax,
2068+
intrem_combines, sub_add_reg, select_to_minmax, fdiv_repeated_divison,
20602069
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
20612070
simplify_neg_minmax, combine_concat_vector,
20622071
sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6408,6 +6408,76 @@ bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI,
64086408
return MatchNaN(1) || MatchNaN(2);
64096409
}
64106410

6411+
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
6412+
// reciprocal.
6413+
// E.g., (a / Y; b / Y;) -> (recip = 1.0 / Y; a * recip; b * recip)
//
// MI is the root G_FDIV (X / Y). MatchInfo is an out-parameter: the root is
// appended first, followed by every other eligible G_FDIV that divides by Y
// in the same basic block. Whenever a newly added entry dominates the current
// MatchInfo[0] the two are swapped, so index 0 is the position used later for
// inserting the shared reciprocal.
//
// Returns true when the number of collected divisions is at least the
// target's combineRepeatedFPDivisors() threshold. Note that MatchInfo is
// already populated when the final threshold check fails.
6414+
bool CombinerHelper::matchRepeatedFPDivisor(
6415+
MachineInstr &MI, SmallVector<MachineInstr *> &MatchInfo) const {
6416+
assert(MI.getOpcode() == TargetOpcode::G_FDIV);
6417+
auto *MF = MI.getMF();
6418+
const TargetOptions &Options = MF->getTarget().Options;
6419+
6420+
// X is the dividend and Y the (potentially shared) divisor of the root.
Register X = MI.getOperand(1).getReg();
6421+
Register Y = MI.getOperand(2).getReg();
6422+
6423+
// The rewrite needs either the global unsafe-math option or the arcp
// fast-math flag on the root instruction.
bool UnsafeMath = Options.UnsafeFPMath;
6424+
if (!UnsafeMath && !MI.getFlag(MachineInstr::MIFlag::FmArcp))
6425+
return false;
6426+
6427+
// Skip if current node is a reciprocal/fneg-reciprocal.
// (i.e. the dividend is a constant, or constant splat, of exactly 1.0 or
// -1.0.)
6428+
auto N0CFP = isConstantOrConstantSplatVectorFP(*MRI.getVRegDef(X), MRI);
6429+
if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
6430+
return false;
6431+
6432+
// Exit early if the target does not want this transform or if there can't
6433+
// possibly be enough uses of the divisor to make the transform worthwhile.
6434+
unsigned MinUses = getTargetLowering().combineRepeatedFPDivisors();
6435+
if (!MinUses)
6436+
return false;
6437+
6438+
// Find all FDIV users of the same divisor. For the moment we limit all
6439+
// instructions to a single BB and use the first Instr in MatchInfo as the
6440+
// dominating position.
6441+
MatchInfo.push_back(&MI);
6442+
for (auto &U : MRI.use_nodbg_instructions(Y)) {
6443+
// The root is already recorded; users in other blocks are ignored.
if (&U == &MI || U.getParent() != MI.getParent())
6444+
continue;
6445+
// Only divisions *by* Y qualify; a user of the form Y / Y is not collected.
if (U.getOpcode() == TargetOpcode::G_FDIV &&
6446+
U.getOperand(2).getReg() == Y && U.getOperand(1).getReg() != Y) {
6447+
// This division is eligible for optimization only if global unsafe math
6448+
// is enabled or if this division allows reciprocal formation.
6449+
if (UnsafeMath || U.getFlag(MachineInstr::MIFlag::FmArcp)) {
6450+
MatchInfo.push_back(&U);
6451+
// Keep a dominating instruction at the front of the list.
if (dominates(U, *MatchInfo[0]))
6452+
std::swap(MatchInfo[0], MatchInfo.back());
6453+
}
6454+
}
6455+
}
6456+
6457+
// Now that we have the actual number of divisor uses, make sure it meets
6458+
// the minimum threshold specified by the target.
6459+
return MatchInfo.size() >= MinUses;
6460+
}
6461+
6462+
// Rewrite every G_FDIV in MatchInfo as a G_FMUL by one shared reciprocal.
// MatchInfo is indexed at [0] unconditionally, so it must be non-empty; it is
// expected to be the list produced by matchRepeatedFPDivisor, whose first
// entry dominates the rest. All instructions in MatchInfo are erased.
void CombinerHelper::applyRepeatedFPDivisor(
6463+
SmallVector<MachineInstr *> &MatchInfo) const {
6464+
// Generate the new div at the position of the first instruction, that we have
6465+
// ensured will dominate all other instructions.
// The reciprocal (1.0 / divisor) takes its type from MatchInfo[0]'s result
// and reuses MatchInfo[0]'s instruction flags.
6466+
Builder.setInsertPt(*MatchInfo[0]->getParent(), MatchInfo[0]);
6467+
LLT Ty = MRI.getType(MatchInfo[0]->getOperand(0).getReg());
6468+
auto Div = Builder.buildFDiv(Ty, Builder.buildFConstant(Ty, 1.0),
6469+
MatchInfo[0]->getOperand(2).getReg(),
6470+
MatchInfo[0]->getFlags());
6471+
6472+
// Replace all found divs with fmul instructions.
// Each fmul is inserted in front of the division it replaces, defines that
// division's original result register, and keeps that division's flags.
6473+
for (MachineInstr *MI : MatchInfo) {
6474+
Builder.setInsertPt(*MI->getParent(), MI);
6475+
Builder.buildFMul(MI->getOperand(0).getReg(), MI->getOperand(1).getReg(),
6476+
Div->getOperand(0).getReg(), MI->getFlags());
6477+
MI->eraseFromParent();
6478+
}
6479+
}
6480+
64116481
bool CombinerHelper::matchAddSubSameReg(MachineInstr &MI, Register &Src) const {
64126482
assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD");
64136483
Register LHS = MI.getOperand(1).getReg();

llvm/test/CodeGen/AArch64/fdiv-combine.ll

Lines changed: 71 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,14 @@
1212
; =>
1313
; recip = 1.0 / D; a * recip; b * recip; c * recip;
1414
define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
15-
; CHECK-SD-LABEL: three_fdiv_float:
16-
; CHECK-SD: // %bb.0:
17-
; CHECK-SD-NEXT: fmov s4, #1.00000000
18-
; CHECK-SD-NEXT: fdiv s4, s4, s0
19-
; CHECK-SD-NEXT: fmul s0, s1, s4
20-
; CHECK-SD-NEXT: fmul s1, s2, s4
21-
; CHECK-SD-NEXT: fmul s2, s3, s4
22-
; CHECK-SD-NEXT: b foo_3f
23-
;
24-
; CHECK-GI-LABEL: three_fdiv_float:
25-
; CHECK-GI: // %bb.0:
26-
; CHECK-GI-NEXT: fdiv s4, s1, s0
27-
; CHECK-GI-NEXT: fdiv s1, s2, s0
28-
; CHECK-GI-NEXT: fdiv s2, s3, s0
29-
; CHECK-GI-NEXT: fmov s0, s4
30-
; CHECK-GI-NEXT: b foo_3f
15+
; CHECK-LABEL: three_fdiv_float:
16+
; CHECK: // %bb.0:
17+
; CHECK-NEXT: fmov s4, #1.00000000
18+
; CHECK-NEXT: fdiv s4, s4, s0
19+
; CHECK-NEXT: fmul s0, s1, s4
20+
; CHECK-NEXT: fmul s1, s2, s4
21+
; CHECK-NEXT: fmul s2, s3, s4
22+
; CHECK-NEXT: b foo_3f
3123
%div = fdiv float %a, %D
3224
%div1 = fdiv float %b, %D
3325
%div2 = fdiv float %c, %D
@@ -36,22 +28,14 @@ define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
3628
}
3729

3830
define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
39-
; CHECK-SD-LABEL: three_fdiv_double:
40-
; CHECK-SD: // %bb.0:
41-
; CHECK-SD-NEXT: fmov d4, #1.00000000
42-
; CHECK-SD-NEXT: fdiv d4, d4, d0
43-
; CHECK-SD-NEXT: fmul d0, d1, d4
44-
; CHECK-SD-NEXT: fmul d1, d2, d4
45-
; CHECK-SD-NEXT: fmul d2, d3, d4
46-
; CHECK-SD-NEXT: b foo_3d
47-
;
48-
; CHECK-GI-LABEL: three_fdiv_double:
49-
; CHECK-GI: // %bb.0:
50-
; CHECK-GI-NEXT: fdiv d4, d1, d0
51-
; CHECK-GI-NEXT: fdiv d1, d2, d0
52-
; CHECK-GI-NEXT: fdiv d2, d3, d0
53-
; CHECK-GI-NEXT: fmov d0, d4
54-
; CHECK-GI-NEXT: b foo_3d
31+
; CHECK-LABEL: three_fdiv_double:
32+
; CHECK: // %bb.0:
33+
; CHECK-NEXT: fmov d4, #1.00000000
34+
; CHECK-NEXT: fdiv d4, d4, d0
35+
; CHECK-NEXT: fmul d0, d1, d4
36+
; CHECK-NEXT: fmul d1, d2, d4
37+
; CHECK-NEXT: fmul d2, d3, d4
38+
; CHECK-NEXT: b foo_3d
5539
%div = fdiv double %a, %D
5640
%div1 = fdiv double %b, %D
5741
%div2 = fdiv double %c, %D
@@ -60,22 +44,14 @@ define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
6044
}
6145

6246
define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
63-
; CHECK-SD-LABEL: three_fdiv_4xfloat:
64-
; CHECK-SD: // %bb.0:
65-
; CHECK-SD-NEXT: fmov v4.4s, #1.00000000
66-
; CHECK-SD-NEXT: fdiv v4.4s, v4.4s, v0.4s
67-
; CHECK-SD-NEXT: fmul v0.4s, v1.4s, v4.4s
68-
; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v4.4s
69-
; CHECK-SD-NEXT: fmul v2.4s, v3.4s, v4.4s
70-
; CHECK-SD-NEXT: b foo_3_4xf
71-
;
72-
; CHECK-GI-LABEL: three_fdiv_4xfloat:
73-
; CHECK-GI: // %bb.0:
74-
; CHECK-GI-NEXT: fdiv v4.4s, v1.4s, v0.4s
75-
; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v0.4s
76-
; CHECK-GI-NEXT: fdiv v2.4s, v3.4s, v0.4s
77-
; CHECK-GI-NEXT: mov v0.16b, v4.16b
78-
; CHECK-GI-NEXT: b foo_3_4xf
47+
; CHECK-LABEL: three_fdiv_4xfloat:
48+
; CHECK: // %bb.0:
49+
; CHECK-NEXT: fmov v4.4s, #1.00000000
50+
; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s
51+
; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s
52+
; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s
53+
; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s
54+
; CHECK-NEXT: b foo_3_4xf
7955
%div = fdiv <4 x float> %a, %D
8056
%div1 = fdiv <4 x float> %b, %D
8157
%div2 = fdiv <4 x float> %c, %D
@@ -84,22 +60,14 @@ define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b,
8460
}
8561

8662
define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
87-
; CHECK-SD-LABEL: three_fdiv_2xdouble:
88-
; CHECK-SD: // %bb.0:
89-
; CHECK-SD-NEXT: fmov v4.2d, #1.00000000
90-
; CHECK-SD-NEXT: fdiv v4.2d, v4.2d, v0.2d
91-
; CHECK-SD-NEXT: fmul v0.2d, v1.2d, v4.2d
92-
; CHECK-SD-NEXT: fmul v1.2d, v2.2d, v4.2d
93-
; CHECK-SD-NEXT: fmul v2.2d, v3.2d, v4.2d
94-
; CHECK-SD-NEXT: b foo_3_2xd
95-
;
96-
; CHECK-GI-LABEL: three_fdiv_2xdouble:
97-
; CHECK-GI: // %bb.0:
98-
; CHECK-GI-NEXT: fdiv v4.2d, v1.2d, v0.2d
99-
; CHECK-GI-NEXT: fdiv v1.2d, v2.2d, v0.2d
100-
; CHECK-GI-NEXT: fdiv v2.2d, v3.2d, v0.2d
101-
; CHECK-GI-NEXT: mov v0.16b, v4.16b
102-
; CHECK-GI-NEXT: b foo_3_2xd
63+
; CHECK-LABEL: three_fdiv_2xdouble:
64+
; CHECK: // %bb.0:
65+
; CHECK-NEXT: fmov v4.2d, #1.00000000
66+
; CHECK-NEXT: fdiv v4.2d, v4.2d, v0.2d
67+
; CHECK-NEXT: fmul v0.2d, v1.2d, v4.2d
68+
; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d
69+
; CHECK-NEXT: fmul v2.2d, v3.2d, v4.2d
70+
; CHECK-NEXT: b foo_3_2xd
10371
%div = fdiv <2 x double> %a, %D
10472
%div1 = fdiv <2 x double> %b, %D
10573
%div2 = fdiv <2 x double> %c, %D
@@ -135,26 +103,47 @@ define void @two_fdiv_double(double %D, double %a, double %b) #0 {
135103
ret void
136104
}
137105

138-
define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
139-
; CHECK-SD-LABEL: splat_three_fdiv_4xfloat:
106+
define void @four_fdiv_multi_float(float %D, float %a, float %b, float %c) #0 {
107+
; CHECK-SD-LABEL: four_fdiv_multi_float:
140108
; CHECK-SD: // %bb.0:
141-
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
142-
; CHECK-SD-NEXT: fmov v4.4s, #1.00000000
143-
; CHECK-SD-NEXT: dup v0.4s, v0.s[0]
144-
; CHECK-SD-NEXT: fdiv v4.4s, v4.4s, v0.4s
145-
; CHECK-SD-NEXT: fmul v0.4s, v1.4s, v4.4s
146-
; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v4.4s
147-
; CHECK-SD-NEXT: fmul v2.4s, v3.4s, v4.4s
148-
; CHECK-SD-NEXT: b foo_3_4xf
109+
; CHECK-SD-NEXT: fmov s4, #1.00000000
110+
; CHECK-SD-NEXT: fdiv s5, s4, s0
111+
; CHECK-SD-NEXT: fmul s4, s1, s5
112+
; CHECK-SD-NEXT: fmul s1, s2, s5
113+
; CHECK-SD-NEXT: fmul s2, s3, s5
114+
; CHECK-SD-NEXT: fmul s3, s0, s5
115+
; CHECK-SD-NEXT: fmov s0, s4
116+
; CHECK-SD-NEXT: b foo_4f
149117
;
150-
; CHECK-GI-LABEL: splat_three_fdiv_4xfloat:
118+
; CHECK-GI-LABEL: four_fdiv_multi_float:
151119
; CHECK-GI: // %bb.0:
152-
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
153-
; CHECK-GI-NEXT: dup v4.4s, v0.s[0]
154-
; CHECK-GI-NEXT: fdiv v0.4s, v1.4s, v4.4s
155-
; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v4.4s
156-
; CHECK-GI-NEXT: fdiv v2.4s, v3.4s, v4.4s
157-
; CHECK-GI-NEXT: b foo_3_4xf
120+
; CHECK-GI-NEXT: fmov s4, #1.00000000
121+
; CHECK-GI-NEXT: fdiv s5, s4, s0
122+
; CHECK-GI-NEXT: fdiv s4, s0, s0
123+
; CHECK-GI-NEXT: fmul s0, s1, s5
124+
; CHECK-GI-NEXT: fmul s1, s2, s5
125+
; CHECK-GI-NEXT: fmul s2, s3, s5
126+
; CHECK-GI-NEXT: fmov s3, s4
127+
; CHECK-GI-NEXT: b foo_4f
128+
%div = fdiv float %a, %D
129+
%div1 = fdiv float %b, %D
130+
%div2 = fdiv float %c, %D
131+
%div3 = fdiv float %D, %D
132+
tail call void @foo_4f(float %div, float %div1, float %div2, float %div3)
133+
ret void
134+
}
135+
136+
define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
137+
; CHECK-LABEL: splat_three_fdiv_4xfloat:
138+
; CHECK: // %bb.0:
139+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
140+
; CHECK-NEXT: fmov v4.4s, #1.00000000
141+
; CHECK-NEXT: dup v0.4s, v0.s[0]
142+
; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s
143+
; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s
144+
; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s
145+
; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s
146+
; CHECK-NEXT: b foo_3_4xf
158147
%D.ins = insertelement <4 x float> poison, float %D, i64 0
159148
%splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer
160149
%div = fdiv <4 x float> %a, %splat
@@ -256,6 +245,7 @@ entry:
256245
}
257246

258247
declare void @foo_3f(float, float, float)
248+
declare void @foo_4f(float, float, float, float)
259249
declare void @foo_3d(double, double, double)
260250
declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>)
261251
declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)

0 commit comments

Comments
 (0)