diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 7d7b5364d6b68..95f43347adecc 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -810,6 +810,10 @@ class CombinerHelper { bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info) const; + bool matchRepeatedFPDivisor(MachineInstr &MI, + SmallVector &MatchInfo) const; + void applyRepeatedFPDivisor(SmallVector &MatchInfo) const; + /// Transform G_ADD(x, G_SUB(y, x)) to y. /// Transform G_ADD(G_SUB(y, x), x) to y. bool matchAddSubSameReg(MachineInstr &MI, Register &Src) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 6033d80e717d3..4ed0f13d801a2 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -212,6 +212,7 @@ def build_fn_matchinfo : GIDefMatchData<"std::function">; def unsigned_matchinfo: GIDefMatchData<"unsigned">; def register_vector_matchinfo : GIDefMatchData<"SmallVector">; +def mi_vector_matchinfo : GIDefMatchData<"SmallVector">; def copy_prop : GICombineRule< (defs root:$d), @@ -1333,6 +1334,14 @@ def combine_minmax_nan: GICombineRule< [{ return Helper.matchCombineFMinMaxNaN(*${root}, ${info}); }]), (apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${info}); }])>; +// Combine multiple FDIVs with the same divisor into multiple FMULs by the +// reciprocal. 
+def fdiv_repeated_divison: GICombineRule< + (defs root:$root, mi_vector_matchinfo:$matchinfo), + (match (G_FDIV $dst, $src1, $src2):$root, + [{ return Helper.matchRepeatedFPDivisor(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyRepeatedFPDivisor(${matchinfo}); }])>; + // Transform (add x, (sub y, x)) -> y // Transform (add (sub y, x), x) -> y def add_sub_reg_frags : GICombinePatFrag< @@ -2056,7 +2065,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, constant_fold_cast_op, fabs_fneg_fold, intdiv_combines, mulh_combines, redundant_neg_operands, and_or_disjoint_mask, fma_combines, fold_binop_into_select, - intrem_combines, sub_add_reg, select_to_minmax, + intrem_combines, sub_add_reg, select_to_minmax, fdiv_repeated_divison, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3b11d0848d300..e590f054cdc96 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6408,6 +6408,73 @@ bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI, return MatchNaN(1) || MatchNaN(2); } +// Combine multiple FDIVs with the same divisor into multiple FMULs by the +// reciprocal. +// E.g., (a / Y; b / Y;) -> (recip = 1.0 / Y; a * recip; b * recip) +bool CombinerHelper::matchRepeatedFPDivisor( + MachineInstr &MI, SmallVector &MatchInfo) const { + assert(MI.getOpcode() == TargetOpcode::G_FDIV); + + Register X = MI.getOperand(1).getReg(); + Register Y = MI.getOperand(2).getReg(); + + if (!MI.getFlag(MachineInstr::MIFlag::FmArcp)) + return false; + + // Skip if current node is a reciprocal/fneg-reciprocal. 
+ auto N0CFP = isConstantOrConstantSplatVectorFP(*MRI.getVRegDef(X), MRI); + if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0))) + return false; + + // Exit early if the target does not want this transform, i.e. it reports a + // minimum divisor-use count of zero. + unsigned MinUses = getTargetLowering().combineRepeatedFPDivisors(); + if (!MinUses) + return false; + + // Find all FDIV users of the same divisor. For the moment we limit all + // instructions to a single BB and use the first Instr in MatchInfo as the + // dominating position. + MatchInfo.push_back(&MI); + for (auto &U : MRI.use_nodbg_instructions(Y)) { + if (&U == &MI || U.getParent() != MI.getParent()) + continue; + if (U.getOpcode() == TargetOpcode::G_FDIV && + U.getOperand(2).getReg() == Y && U.getOperand(1).getReg() != Y) { + // This division is eligible for optimization only if it carries the + // arcp fast-math flag, i.e. it allows reciprocal formation. + if (U.getFlag(MachineInstr::MIFlag::FmArcp)) { + MatchInfo.push_back(&U); + if (dominates(U, *MatchInfo[0])) + std::swap(MatchInfo[0], MatchInfo.back()); + } + } + } + + // Now that we have the actual number of divisor uses, make sure it meets + // the minimum threshold specified by the target. + return MatchInfo.size() >= MinUses; +} + +void CombinerHelper::applyRepeatedFPDivisor( + SmallVector &MatchInfo) const { + // Generate the new div at the position of the first instruction, which we + // have ensured will dominate all other instructions. + Builder.setInsertPt(*MatchInfo[0]->getParent(), MatchInfo[0]); + LLT Ty = MRI.getType(MatchInfo[0]->getOperand(0).getReg()); + auto Div = Builder.buildFDiv(Ty, Builder.buildFConstant(Ty, 1.0), + MatchInfo[0]->getOperand(2).getReg(), + MatchInfo[0]->getFlags()); + + // Replace all found divs with fmul instructions. 
+ for (MachineInstr *MI : MatchInfo) { + Builder.setInsertPt(*MI->getParent(), MI); + Builder.buildFMul(MI->getOperand(0).getReg(), MI->getOperand(1).getReg(), + Div->getOperand(0).getReg(), MI->getFlags()); + MI->eraseFromParent(); + } +} + bool CombinerHelper::matchAddSubSameReg(MachineInstr &MI, Register &Src) const { assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD"); Register LHS = MI.getOperand(1).getReg(); diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll index d8f7f0a306684..592665fc1014a 100644 --- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll +++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll @@ -12,97 +12,65 @@ ; => ; recip = 1.0 / D; a * recip; b * recip; c * recip; define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 { -; CHECK-SD-LABEL: three_fdiv_float: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmov s4, #1.00000000 -; CHECK-SD-NEXT: fdiv s4, s4, s0 -; CHECK-SD-NEXT: fmul s0, s1, s4 -; CHECK-SD-NEXT: fmul s1, s2, s4 -; CHECK-SD-NEXT: fmul s2, s3, s4 -; CHECK-SD-NEXT: b foo_3f -; -; CHECK-GI-LABEL: three_fdiv_float: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv s4, s1, s0 -; CHECK-GI-NEXT: fdiv s1, s2, s0 -; CHECK-GI-NEXT: fdiv s2, s3, s0 -; CHECK-GI-NEXT: fmov s0, s4 -; CHECK-GI-NEXT: b foo_3f - %div = fdiv float %a, %D - %div1 = fdiv float %b, %D - %div2 = fdiv float %c, %D +; CHECK-LABEL: three_fdiv_float: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s4, #1.00000000 +; CHECK-NEXT: fdiv s4, s4, s0 +; CHECK-NEXT: fmul s0, s1, s4 +; CHECK-NEXT: fmul s1, s2, s4 +; CHECK-NEXT: fmul s2, s3, s4 +; CHECK-NEXT: b foo_3f + %div = fdiv arcp float %a, %D + %div1 = fdiv arcp float %b, %D + %div2 = fdiv arcp float %c, %D tail call void @foo_3f(float %div, float %div1, float %div2) ret void } define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 { -; CHECK-SD-LABEL: three_fdiv_double: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmov d4, #1.00000000 -; CHECK-SD-NEXT: 
fdiv d4, d4, d0 -; CHECK-SD-NEXT: fmul d0, d1, d4 -; CHECK-SD-NEXT: fmul d1, d2, d4 -; CHECK-SD-NEXT: fmul d2, d3, d4 -; CHECK-SD-NEXT: b foo_3d -; -; CHECK-GI-LABEL: three_fdiv_double: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv d4, d1, d0 -; CHECK-GI-NEXT: fdiv d1, d2, d0 -; CHECK-GI-NEXT: fdiv d2, d3, d0 -; CHECK-GI-NEXT: fmov d0, d4 -; CHECK-GI-NEXT: b foo_3d - %div = fdiv double %a, %D - %div1 = fdiv double %b, %D - %div2 = fdiv double %c, %D +; CHECK-LABEL: three_fdiv_double: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d4, #1.00000000 +; CHECK-NEXT: fdiv d4, d4, d0 +; CHECK-NEXT: fmul d0, d1, d4 +; CHECK-NEXT: fmul d1, d2, d4 +; CHECK-NEXT: fmul d2, d3, d4 +; CHECK-NEXT: b foo_3d + %div = fdiv arcp double %a, %D + %div1 = fdiv arcp double %b, %D + %div2 = fdiv arcp double %c, %D tail call void @foo_3d(double %div, double %div1, double %div2) ret void } define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { -; CHECK-SD-LABEL: three_fdiv_4xfloat: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmov v4.4s, #1.00000000 -; CHECK-SD-NEXT: fdiv v4.4s, v4.4s, v0.4s -; CHECK-SD-NEXT: fmul v0.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v4.4s -; CHECK-SD-NEXT: fmul v2.4s, v3.4s, v4.4s -; CHECK-SD-NEXT: b foo_3_4xf -; -; CHECK-GI-LABEL: three_fdiv_4xfloat: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv v4.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v0.4s -; CHECK-GI-NEXT: fdiv v2.4s, v3.4s, v0.4s -; CHECK-GI-NEXT: mov v0.16b, v4.16b -; CHECK-GI-NEXT: b foo_3_4xf - %div = fdiv <4 x float> %a, %D - %div1 = fdiv <4 x float> %b, %D - %div2 = fdiv <4 x float> %c, %D +; CHECK-LABEL: three_fdiv_4xfloat: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov v4.4s, #1.00000000 +; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s +; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s +; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s +; CHECK-NEXT: b foo_3_4xf + %div = fdiv arcp <4 x float> %a, %D + %div1 = fdiv arcp <4 x float> %b, %D + 
%div2 = fdiv arcp <4 x float> %c, %D tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2) ret void } define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 { -; CHECK-SD-LABEL: three_fdiv_2xdouble: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmov v4.2d, #1.00000000 -; CHECK-SD-NEXT: fdiv v4.2d, v4.2d, v0.2d -; CHECK-SD-NEXT: fmul v0.2d, v1.2d, v4.2d -; CHECK-SD-NEXT: fmul v1.2d, v2.2d, v4.2d -; CHECK-SD-NEXT: fmul v2.2d, v3.2d, v4.2d -; CHECK-SD-NEXT: b foo_3_2xd -; -; CHECK-GI-LABEL: three_fdiv_2xdouble: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv v4.2d, v1.2d, v0.2d -; CHECK-GI-NEXT: fdiv v1.2d, v2.2d, v0.2d -; CHECK-GI-NEXT: fdiv v2.2d, v3.2d, v0.2d -; CHECK-GI-NEXT: mov v0.16b, v4.16b -; CHECK-GI-NEXT: b foo_3_2xd - %div = fdiv <2 x double> %a, %D - %div1 = fdiv <2 x double> %b, %D - %div2 = fdiv <2 x double> %c, %D +; CHECK-LABEL: three_fdiv_2xdouble: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov v4.2d, #1.00000000 +; CHECK-NEXT: fdiv v4.2d, v4.2d, v0.2d +; CHECK-NEXT: fmul v0.2d, v1.2d, v4.2d +; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d +; CHECK-NEXT: fmul v2.2d, v3.2d, v4.2d +; CHECK-NEXT: b foo_3_2xd + %div = fdiv arcp <2 x double> %a, %D + %div1 = fdiv arcp <2 x double> %b, %D + %div2 = fdiv arcp <2 x double> %c, %D tail call void @foo_3_2xd(<2 x double> %div, <2 x double> %div1, <2 x double> %div2) ret void } @@ -116,8 +84,8 @@ define void @two_fdiv_float(float %D, float %a, float %b) #0 { ; CHECK-NEXT: fdiv s1, s2, s0 ; CHECK-NEXT: fmov s0, s3 ; CHECK-NEXT: b foo_2f - %div = fdiv float %a, %D - %div1 = fdiv float %b, %D + %div = fdiv arcp float %a, %D + %div1 = fdiv arcp float %b, %D tail call void @foo_2f(float %div, float %div1) ret void } @@ -129,37 +97,58 @@ define void @two_fdiv_double(double %D, double %a, double %b) #0 { ; CHECK-NEXT: fdiv d1, d2, d0 ; CHECK-NEXT: fmov d0, d3 ; CHECK-NEXT: b foo_2d - %div = fdiv double %a, %D - %div1 = fdiv double %b, %D + %div = fdiv arcp double 
%a, %D + %div1 = fdiv arcp double %b, %D tail call void @foo_2d(double %div, double %div1) ret void } -define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { -; CHECK-SD-LABEL: splat_three_fdiv_4xfloat: +define void @four_fdiv_multi_float(float %D, float %a, float %b, float %c) #0 { +; CHECK-SD-LABEL: four_fdiv_multi_float: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-SD-NEXT: fmov v4.4s, #1.00000000 -; CHECK-SD-NEXT: dup v0.4s, v0.s[0] -; CHECK-SD-NEXT: fdiv v4.4s, v4.4s, v0.4s -; CHECK-SD-NEXT: fmul v0.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v4.4s -; CHECK-SD-NEXT: fmul v2.4s, v3.4s, v4.4s -; CHECK-SD-NEXT: b foo_3_4xf +; CHECK-SD-NEXT: fmov s4, #1.00000000 +; CHECK-SD-NEXT: fdiv s5, s4, s0 +; CHECK-SD-NEXT: fmul s4, s1, s5 +; CHECK-SD-NEXT: fmul s1, s2, s5 +; CHECK-SD-NEXT: fmul s2, s3, s5 +; CHECK-SD-NEXT: fmul s3, s0, s5 +; CHECK-SD-NEXT: fmov s0, s4 +; CHECK-SD-NEXT: b foo_4f ; -; CHECK-GI-LABEL: splat_three_fdiv_4xfloat: +; CHECK-GI-LABEL: four_fdiv_multi_float: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-GI-NEXT: dup v4.4s, v0.s[0] -; CHECK-GI-NEXT: fdiv v0.4s, v1.4s, v4.4s -; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: fdiv v2.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: b foo_3_4xf +; CHECK-GI-NEXT: fmov s4, #1.00000000 +; CHECK-GI-NEXT: fdiv s5, s4, s0 +; CHECK-GI-NEXT: fdiv s4, s0, s0 +; CHECK-GI-NEXT: fmul s0, s1, s5 +; CHECK-GI-NEXT: fmul s1, s2, s5 +; CHECK-GI-NEXT: fmul s2, s3, s5 +; CHECK-GI-NEXT: fmov s3, s4 +; CHECK-GI-NEXT: b foo_4f + %div = fdiv arcp float %a, %D + %div1 = fdiv arcp float %b, %D + %div2 = fdiv arcp float %c, %D + %div3 = fdiv arcp float %D, %D + tail call void @foo_4f(float %div, float %div1, float %div2, float %div3) + ret void +} + +define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; CHECK-LABEL: splat_three_fdiv_4xfloat: +; CHECK: 
// %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: fmov v4.4s, #1.00000000 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s +; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s +; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s +; CHECK-NEXT: b foo_3_4xf %D.ins = insertelement <4 x float> poison, float %D, i64 0 %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer - %div = fdiv <4 x float> %a, %splat - %div1 = fdiv <4 x float> %b, %splat - %div2 = fdiv <4 x float> %c, %splat + %div = fdiv arcp <4 x float> %a, %splat + %div1 = fdiv arcp <4 x float> %b, %splat + %div2 = fdiv arcp <4 x float> %c, %splat tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2) ret void } @@ -183,7 +172,7 @@ define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 { entry: %D.ins = insertelement <4 x float> poison, float %D, i64 0 %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer - %div = fdiv <4 x float> %a, %splat + %div = fdiv arcp <4 x float> %a, %splat ret <4 x float> %div } @@ -198,7 +187,7 @@ define @splat_fdiv_nxv4f32(float %D, % entry: %D.ins = insertelement poison, float %D, i64 0 %splat = shufflevector %D.ins, poison, zeroinitializer - %div = fdiv %a, %splat + %div = fdiv arcp %a, %splat ret %div } @@ -215,9 +204,9 @@ define void @splat_three_fdiv_nxv4f32(float %D, %a, poison, float %D, i64 0 %splat = shufflevector %D.ins, poison, zeroinitializer - %div = fdiv %a, %splat - %div1 = fdiv %b, %splat - %div2 = fdiv %c, %splat + %div = fdiv arcp %a, %splat + %div1 = fdiv arcp %b, %splat + %div2 = fdiv arcp %c, %splat tail call void @foo_3_nxv4f32( %div, %div1, %div2) ret void } @@ -233,7 +222,7 @@ define @splat_fdiv_nxv2f64(double %D, poison, double %D, i64 0 %splat = shufflevector %D.ins, poison, zeroinitializer - %div = fdiv %a, %splat + %div = fdiv arcp %a, %splat ret %div } @@ -249,13 +238,14 @@ define 
void @splat_two_fdiv_nxv2f64(double %D, %a, poison, double %D, i64 0 %splat = shufflevector %D.ins, poison, zeroinitializer - %div = fdiv %a, %splat - %div1 = fdiv %b, %splat + %div = fdiv arcp %a, %splat + %div1 = fdiv arcp %b, %splat tail call void @foo_2_nxv2f64( %div, %div1) ret void } declare void @foo_3f(float, float, float) +declare void @foo_4f(float, float, float, float) declare void @foo_3d(double, double, double) declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>) declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)