From 83257edb4d088d2ab471b5454b1f799e57a84276 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 24 Jun 2025 16:04:45 +0900
Subject: [PATCH] DAG: Use fast variants of fast math libcalls

Hexagon currently has an untested global flag to control fast
math variants of libcalls. Add fast variants as explicit libcall
options so this can be a flag based lowering decision, and implement
it. I have no idea what fast math flags the hexagon case requires,
so I picked the maximally potentially relevant set of flags although
this probably is refinable per call. Looking in compiler-rt, I'm not
sure if the fast variants are anything more than aliases.
---
 llvm/include/llvm/IR/RuntimeLibcalls.td       |  26 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 125 ++++--
 llvm/lib/IR/RuntimeLibcalls.cpp               |  41 +-
 .../CodeGen/Hexagon/fast-math-libcalls.ll     | 369 ++++++++++++++++++
 4 files changed, 501 insertions(+), 60 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/fast-math-libcalls.ll

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index cd676e1661d62..3ef1803018756 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -62,13 +62,24 @@ foreach IntTy = ["I32", "I64", "I128"] in {
 
 foreach FPTy = ["F32", "F64", "F80", "F128", "PPCF128"] in {
   def ADD_#FPTy : RuntimeLibcall;
+  def FAST_ADD_#FPTy : RuntimeLibcall;
+
   def SUB_#FPTy : RuntimeLibcall;
+  def FAST_SUB_#FPTy : RuntimeLibcall;
+
   def MUL_#FPTy : RuntimeLibcall;
+  def FAST_MUL_#FPTy : RuntimeLibcall;
+
   def DIV_#FPTy : RuntimeLibcall;
+  def FAST_DIV_#FPTy : RuntimeLibcall;
+
   def REM_#FPTy : RuntimeLibcall;
   def FMA_#FPTy : RuntimeLibcall;
   def POWI_#FPTy : RuntimeLibcall;
+
   def SQRT_#FPTy : RuntimeLibcall;
+  def FAST_SQRT_#FPTy : RuntimeLibcall;
+
   def CBRT_#FPTy : RuntimeLibcall;
   def LOG_#FPTy : RuntimeLibcall;
   def LOG_FINITE_#FPTy : RuntimeLibcall;
@@ -1411,27 +1422,26 @@ def __hexagon_moddi3 : RuntimeLibcallImpl<SREM_I64>;
 def __hexagon_umodsi3 : RuntimeLibcallImpl<UREM_I32>;
 def __hexagon_umoddi3 : RuntimeLibcallImpl<UREM_I64>;
 
-// FIXME: "Fast" versions should be treated as a separate RTLIB::FAST_* function
 def __hexagon_adddf3 : RuntimeLibcallImpl<ADD_F64>;
-def __hexagon_fast_adddf3 : RuntimeLibcallImpl<ADD_F64>;
+def __hexagon_fast_adddf3 : RuntimeLibcallImpl<FAST_ADD_F64>;
 
 def __hexagon_subdf3 : RuntimeLibcallImpl<SUB_F64>;
-def __hexagon_fast_subdf3 : RuntimeLibcallImpl<SUB_F64>;
+def __hexagon_fast_subdf3 : RuntimeLibcallImpl<FAST_SUB_F64>;
 
 def __hexagon_muldf3 : RuntimeLibcallImpl<MUL_F64>;
-def __hexagon_fast_muldf3 : RuntimeLibcallImpl<MUL_F64>;
+def __hexagon_fast_muldf3 : RuntimeLibcallImpl<FAST_MUL_F64>;
 
 def __hexagon_divdf3 : RuntimeLibcallImpl<DIV_F64>;
-def __hexagon_fast_divdf3 : RuntimeLibcallImpl<DIV_F64>;
+def __hexagon_fast_divdf3 : RuntimeLibcallImpl<FAST_DIV_F64>;
 
 def __hexagon_divsf3 : RuntimeLibcallImpl<DIV_F32>;
-def __hexagon_fast_divsf3 : RuntimeLibcallImpl<DIV_F32>;
+def __hexagon_fast_divsf3 : RuntimeLibcallImpl<FAST_DIV_F32>;
 
 def __hexagon_sqrtf : RuntimeLibcallImpl<SQRT_F32>;
-def __hexagon_fast2_sqrtf : RuntimeLibcallImpl<SQRT_F32>;
+def __hexagon_fast2_sqrtf : RuntimeLibcallImpl<FAST_SQRT_F32>;
 
 // This is the only fast library function for sqrtd.
-def __hexagon_fast2_sqrtdf2 : RuntimeLibcallImpl<SQRT_F64>;
+def __hexagon_fast2_sqrtdf2 : RuntimeLibcallImpl<FAST_SQRT_F64>;
 
 def __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
     : RuntimeLibcallImpl<HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES>;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index a48dd0e5fedba..10f49c2b65768 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -140,12 +140,19 @@ class SelectionDAGLegalize {
                        RTLIB::Libcall Call_F128,
                        RTLIB::Libcall Call_PPCF128,
                        SmallVectorImpl<SDValue> &Results);
-  SDValue ExpandIntLibCall(SDNode *Node, bool isSigned,
-                           RTLIB::Libcall Call_I8,
-                           RTLIB::Libcall Call_I16,
-                           RTLIB::Libcall Call_I32,
-                           RTLIB::Libcall Call_I64,
-                           RTLIB::Libcall Call_I128);
+
+  void
+  ExpandFastFPLibCall(SDNode *Node, bool IsFast,
+                      std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F32,
+                      std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F64,
+                      std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F80,
+                      std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F128,
+                      std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_PPCF128,
+                      SmallVectorImpl<SDValue> &Results);
+
+  SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8,
+                           RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32,
+                           RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128);
   void ExpandArgFPLibCall(SDNode *Node,
                           RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64,
                           RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128,
@@ -2229,6 +2236,37 @@ void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
   ExpandFPLibCall(Node, LC, Results);
 }
 
+void SelectionDAGLegalize::ExpandFastFPLibCall(
+    SDNode *Node, bool IsFast,
+    std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F32,
+    std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F64,
+    std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F80,
+    std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_F128,
+    std::pair<RTLIB::Libcall, RTLIB::Libcall> Call_PPCF128,
+    SmallVectorImpl<SDValue> &Results) {
+
+  EVT VT = Node->getSimpleValueType(0);
+
+  RTLIB::Libcall LC;
+
+  // FIXME: Probably should define fast to respect nan/inf and only be
+  // approximate functions.
+
+  if (IsFast) {
+    LC = RTLIB::getFPLibCall(VT, Call_F32.first, Call_F64.first, Call_F80.first,
+                             Call_F128.first, Call_PPCF128.first);
+  }
+
+  if (!IsFast || TLI.getLibcallImpl(LC) == RTLIB::Unsupported) {
+    // Fall back if we don't have a fast implementation.
+    LC = RTLIB::getFPLibCall(VT, Call_F32.second, Call_F64.second,
+                             Call_F80.second, Call_F128.second,
+                             Call_PPCF128.second);
+  }
+
+  ExpandFPLibCall(Node, LC, Results);
+}
+
 SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
                                                RTLIB::Libcall Call_I8,
                                                RTLIB::Libcall Call_I16,
@@ -4515,6 +4553,18 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   return true;
 }
 
+/// Return if we can use the FAST_* variant of a math libcall for the node.
+/// FIXME: This is just guessing, we probably should have unique specific sets
+/// flags required per libcall.
+static bool canUseFastMathLibcall(const SDNode *Node) {
+  // FIXME: Probably should define fast to respect nan/inf and only be
+  // approximate functions.
+
+  SDNodeFlags Flags = Node->getFlags();
+  return Flags.hasApproximateFuncs() && Flags.hasNoNaNs() &&
+         Flags.hasNoInfs() && Flags.hasNoSignedZeros();
+}
+
 void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
   LLVM_DEBUG(dbgs() << "Trying to convert node to libcall\n");
   SmallVector<SDValue, 8> Results;
@@ -4635,11 +4685,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                     RTLIB::FMAXIMUM_NUM_PPCF128, Results);
     break;
   case ISD::FSQRT:
-  case ISD::STRICT_FSQRT:
-    ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
-                    RTLIB::SQRT_F80, RTLIB::SQRT_F128,
-                    RTLIB::SQRT_PPCF128, Results);
+  case ISD::STRICT_FSQRT: {
+    // FIXME: Probably should define fast to respect nan/inf and only be
+    // approximate functions.
+    ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+                        {RTLIB::FAST_SQRT_F32, RTLIB::SQRT_F32},
+                        {RTLIB::FAST_SQRT_F64, RTLIB::SQRT_F64},
+                        {RTLIB::FAST_SQRT_F80, RTLIB::SQRT_F80},
+                        {RTLIB::FAST_SQRT_F128, RTLIB::SQRT_F128},
+                        {RTLIB::FAST_SQRT_PPCF128, RTLIB::SQRT_PPCF128},
+                        Results);
     break;
+  }
   case ISD::FCBRT:
     ExpandFPLibCall(Node, RTLIB::CBRT_F32, RTLIB::CBRT_F64,
                     RTLIB::CBRT_F80, RTLIB::CBRT_F128,
@@ -4876,11 +4933,15 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                        RTLIB::LLRINT_PPCF128, Results);
     break;
   case ISD::FDIV:
-  case ISD::STRICT_FDIV:
-    ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
-                    RTLIB::DIV_F80, RTLIB::DIV_F128,
-                    RTLIB::DIV_PPCF128, Results);
+  case ISD::STRICT_FDIV: {
+    ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+                        {RTLIB::FAST_DIV_F32, RTLIB::DIV_F32},
+                        {RTLIB::FAST_DIV_F64, RTLIB::DIV_F64},
+                        {RTLIB::FAST_DIV_F80, RTLIB::DIV_F80},
+                        {RTLIB::FAST_DIV_F128, RTLIB::DIV_F128},
+                        {RTLIB::FAST_DIV_PPCF128, RTLIB::DIV_PPCF128}, Results);
     break;
+  }
   case ISD::FREM:
   case ISD::STRICT_FREM:
     ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64,
@@ -4894,17 +4955,25 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                     RTLIB::FMA_PPCF128, Results);
     break;
   case ISD::FADD:
-  case ISD::STRICT_FADD:
-    ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
-                    RTLIB::ADD_F80, RTLIB::ADD_F128,
-                    RTLIB::ADD_PPCF128, Results);
+  case ISD::STRICT_FADD: {
+    ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+                        {RTLIB::FAST_ADD_F32, RTLIB::ADD_F32},
+                        {RTLIB::FAST_ADD_F64, RTLIB::ADD_F64},
+                        {RTLIB::FAST_ADD_F80, RTLIB::ADD_F80},
+                        {RTLIB::FAST_ADD_F128, RTLIB::ADD_F128},
+                        {RTLIB::FAST_ADD_PPCF128, RTLIB::ADD_PPCF128}, Results);
     break;
+  }
   case ISD::FMUL:
-  case ISD::STRICT_FMUL:
-    ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64,
-                    RTLIB::MUL_F80, RTLIB::MUL_F128,
-                    RTLIB::MUL_PPCF128, Results);
+  case ISD::STRICT_FMUL: {
+    ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+                        {RTLIB::FAST_MUL_F32, RTLIB::MUL_F32},
+                        {RTLIB::FAST_MUL_F64, RTLIB::MUL_F64},
+                        {RTLIB::FAST_MUL_F80, RTLIB::MUL_F80},
+                        {RTLIB::FAST_MUL_F128, RTLIB::MUL_F128},
+                        {RTLIB::FAST_MUL_PPCF128, RTLIB::MUL_PPCF128}, Results);
     break;
+  }
   case ISD::FP16_TO_FP:
     if (Node->getValueType(0) == MVT::f32) {
       Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false).first);
@@ -5077,11 +5146,15 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     break;
   }
   case ISD::FSUB:
-  case ISD::STRICT_FSUB:
-    ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64,
-                    RTLIB::SUB_F80, RTLIB::SUB_F128,
-                    RTLIB::SUB_PPCF128, Results);
+  case ISD::STRICT_FSUB: {
+    ExpandFastFPLibCall(Node, canUseFastMathLibcall(Node),
+                        {RTLIB::FAST_SUB_F32, RTLIB::SUB_F32},
+                        {RTLIB::FAST_SUB_F64, RTLIB::SUB_F64},
+                        {RTLIB::FAST_SUB_F80, RTLIB::SUB_F80},
+                        {RTLIB::FAST_SUB_F128, RTLIB::SUB_F128},
+                        {RTLIB::FAST_SUB_PPCF128, RTLIB::SUB_PPCF128}, Results);
     break;
+  }
   case ISD::SREM:
     Results.push_back(ExpandIntLibCall(Node, true,
                                        RTLIB::SREM_I8,
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 64c9415c54d4d..c4fd40f313077 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -18,10 +18,6 @@ using namespace RTLIB;
 #undef GET_INIT_RUNTIME_LIBCALL_NAMES
 #undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS
 
-static cl::opt<bool>
-    HexagonEnableFastMathRuntimeCalls("hexagon-fast-math", cl::Hidden,
-                                      cl::desc("Enable Fast Math processing"));
-
 static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
                                FloatABI::ABIType FloatABIType,
                                EABI EABIVersion) {
@@ -268,32 +264,25 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
     setLibcallImpl(RTLIB::UREM_I32, RTLIB::__hexagon_umodsi3);
     setLibcallImpl(RTLIB::UREM_I64, RTLIB::__hexagon_umoddi3);
 
-    const bool FastMath = HexagonEnableFastMathRuntimeCalls;
-    // This is the only fast library function for sqrtd.
-    if (FastMath)
-      setLibcallImpl(RTLIB::SQRT_F64, RTLIB::__hexagon_fast2_sqrtdf2);
-
     // Prefix is: nothing  for "slow-math",
     //            "fast2_" for V5+ fast-math double-precision
     // (actually, keep fast-math and fast-math2 separate for now)
-    if (FastMath) {
-      setLibcallImpl(RTLIB::ADD_F64, RTLIB::__hexagon_fast_adddf3);
-      setLibcallImpl(RTLIB::SUB_F64, RTLIB::__hexagon_fast_subdf3);
-      setLibcallImpl(RTLIB::MUL_F64, RTLIB::__hexagon_fast_muldf3);
-      setLibcallImpl(RTLIB::DIV_F64, RTLIB::__hexagon_fast_divdf3);
-      setLibcallImpl(RTLIB::DIV_F32, RTLIB::__hexagon_fast_divsf3);
-    } else {
-      setLibcallImpl(RTLIB::ADD_F64, RTLIB::__hexagon_adddf3);
-      setLibcallImpl(RTLIB::SUB_F64, RTLIB::__hexagon_subdf3);
-      setLibcallImpl(RTLIB::MUL_F64, RTLIB::__hexagon_muldf3);
-      setLibcallImpl(RTLIB::DIV_F64, RTLIB::__hexagon_divdf3);
-      setLibcallImpl(RTLIB::DIV_F32, RTLIB::__hexagon_divsf3);
-    }
 
-    if (FastMath)
-      setLibcallImpl(RTLIB::SQRT_F32, RTLIB::__hexagon_fast2_sqrtf);
-    else
-      setLibcallImpl(RTLIB::SQRT_F32, RTLIB::__hexagon_sqrtf);
+    setLibcallImpl(RTLIB::FAST_ADD_F64, RTLIB::__hexagon_fast_adddf3);
+    setLibcallImpl(RTLIB::FAST_SUB_F64, RTLIB::__hexagon_fast_subdf3);
+    setLibcallImpl(RTLIB::FAST_MUL_F64, RTLIB::__hexagon_fast_muldf3);
+    setLibcallImpl(RTLIB::FAST_DIV_F64, RTLIB::__hexagon_fast_divdf3);
+    setLibcallImpl(RTLIB::FAST_DIV_F32, RTLIB::__hexagon_fast_divsf3);
+    setLibcallImpl(RTLIB::FAST_SQRT_F32, RTLIB::__hexagon_fast2_sqrtf);
+    // This is the only fast library function for sqrtd.
+    setLibcallImpl(RTLIB::FAST_SQRT_F64, RTLIB::__hexagon_fast2_sqrtdf2);
+
+    setLibcallImpl(RTLIB::ADD_F64, RTLIB::__hexagon_adddf3);
+    setLibcallImpl(RTLIB::SUB_F64, RTLIB::__hexagon_subdf3);
+    setLibcallImpl(RTLIB::MUL_F64, RTLIB::__hexagon_muldf3);
+    setLibcallImpl(RTLIB::DIV_F64, RTLIB::__hexagon_divdf3);
+    setLibcallImpl(RTLIB::DIV_F32, RTLIB::__hexagon_divsf3);
+    setLibcallImpl(RTLIB::SQRT_F32, RTLIB::__hexagon_sqrtf);
 
     setLibcallImpl(
         RTLIB::HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES,
diff --git a/llvm/test/CodeGen/Hexagon/fast-math-libcalls.ll b/llvm/test/CodeGen/Hexagon/fast-math-libcalls.ll
new file mode 100644
index 0000000000000..6bc60132d3e6a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/fast-math-libcalls.ll
@@ -0,0 +1,369 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+
+;---------------------------------------------------------------------
+; fast sqrt
+;---------------------------------------------------------------------
+
+define float @fast_sqrt_f32(float %x) {
+; CHECK-LABEL: fast_sqrt_f32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_fast2_sqrtf
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = call nnan ninf nsz afn float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define double @fast_sqrt_f64(double %x) {
+; CHECK-LABEL: fast_sqrt_f64:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_fast2_sqrtdf2
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = call nnan ninf nsz afn double @llvm.sqrt.f64(double %x)
+  ret double %result
+}
+
+;---------------------------------------------------------------------
+; fast fadd
+;---------------------------------------------------------------------
+
+define float @fast_add_f32(float %x, float %y) {
+; CHECK-LABEL: fast_add_f32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = sfadd(r0,r1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fadd nnan ninf nsz afn float %x, %y
+  ret float %result
+}
+
+define double @fast_add_f64(double %x, double %y) {
+; CHECK-LABEL: fast_add_f64:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_fast_adddf3
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = fadd nnan ninf nsz afn double %x, %y
+  ret double %result
+}
+
+;---------------------------------------------------------------------
+; fast fsub
+;---------------------------------------------------------------------
+
+define float @fast_sub_f32(float %x, float %y) {
+; CHECK-LABEL: fast_sub_f32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = sfsub(r0,r1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fsub nnan ninf nsz afn float %x, %y
+  ret float %result
+}
+
+define double @fast_sub_f64(double %x, double %y) {
+; CHECK-LABEL: fast_sub_f64:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_fast_subdf3
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = fsub nnan ninf nsz afn double %x, %y
+  ret double %result
+}
+
+;---------------------------------------------------------------------
+; fast fmul
+;---------------------------------------------------------------------
+
+define float @fast_mul_f32(float %x, float %y) {
+; CHECK-LABEL: fast_mul_f32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = sfmpy(r0,r1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fmul nnan ninf nsz afn float %x, %y
+  ret float %result
+}
+
+define double @fast_mul_f64(double %x, double %y) {
+; CHECK-LABEL: fast_mul_f64:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_fast_muldf3
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = fmul nnan ninf nsz afn double %x, %y
+  ret double %result
+}
+
+;---------------------------------------------------------------------
+; fast fdiv
+;---------------------------------------------------------------------
+
+define float @fast_div_f32(float %x, float %y) {
+; CHECK-LABEL: fast_div_f32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r2 = sffixupn(r0,r1)
+; CHECK-NEXT:     r4,p0 = sfrecipa(r0,r1)
+; CHECK-NEXT:     r5 = ##1065353216
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r1 = sffixupd(r0,r1)
+; CHECK-NEXT:     r6 = ##1065353216
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r5 -= sfmpy(r1,r4):lib
+; CHECK-NEXT:     r0 = and(r2,##-2147483648)
+; CHECK-NEXT:     r3 = r2
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r4 += sfmpy(r5,r4):lib
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 += sfmpy(r2,r4):lib
+; CHECK-NEXT:     r6 -= sfmpy(r1,r4):lib
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r3 -= sfmpy(r1,r0):lib
+; CHECK-NEXT:     r4 += sfmpy(r6,r4):lib
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 += sfmpy(r3,r4):lib
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r2 -= sfmpy(r0,r1):lib
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 += sfmpy(r2,r4,p0):scale
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fdiv nnan ninf nsz afn float %x, %y
+  ret float %result
+}
+
+define double @fast_div_f64(double %x, double %y) {
+; CHECK-LABEL: fast_div_f64:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_fast_divdf3
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = fdiv nnan ninf nsz afn double %x, %y
+  ret double %result
+}
+
+;---------------------------------------------------------------------
+; Negative tests sqrt
+;---------------------------------------------------------------------
+
+; TODO: What flags do we really need here?
+define float @sqrt_f32__afn(float %x) {
+; CHECK-LABEL: sqrt_f32__afn:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_sqrtf
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = call afn float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @sqrt_f32__afn_ninf(float %x) {
+; CHECK-LABEL: sqrt_f32__afn_ninf:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_sqrtf
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = call afn ninf float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @sqrt_f32__afn_nnan(float %x) {
+; CHECK-LABEL: sqrt_f32__afn_nnan:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_sqrtf
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = call afn nnan float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @sqrt_f32__nnan(float %x) {
+; CHECK-LABEL: sqrt_f32__nnan:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_sqrtf
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = call nnan float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @sqrt_f32_nnan_ninf_afn(float %x) {
+; CHECK-LABEL: sqrt_f32_nnan_ninf_afn:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    .cfi_def_cfa r30, 8
+; CHECK-NEXT:    .cfi_offset r31, -4
+; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK-NEXT:    {
+; CHECK-NEXT:     call __hexagon_sqrtf
+; CHECK-NEXT:     allocframe(r29,#0):raw
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT:    }
+  %result = call nnan ninf afn float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+;---------------------------------------------------------------------
+; Negative tests fadd
+;---------------------------------------------------------------------
+
+; TODO: What flags do we really need here?
+define float @fadd_f32_afn(float %x, float %y) {
+; CHECK-LABEL: fadd_f32_afn:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = sfadd(r0,r1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fadd afn float %x, %y
+  ret float %result
+}
+
+define float @fadd_f32__afn_ninf(float %x, float %y) {
+; CHECK-LABEL: fadd_f32__afn_ninf:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = sfadd(r0,r1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fadd afn ninf float %x, %y
+  ret float %result
+}
+
+define float @fadd_f32__afn_nnan(float %x, float %y) {
+; CHECK-LABEL: fadd_f32__afn_nnan:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = sfadd(r0,r1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fadd afn nnan float %x, %y
+  ret float %result
+}
+
+define float @fadd_f32__nnan(float %x, float %y) {
+; CHECK-LABEL: fadd_f32__nnan:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = sfadd(r0,r1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fadd nnan float %x, %y
+  ret float %result
+}
+
+define float @fadd_f32__nnan_ninf_afn(float %x, float %y) {
+; CHECK-LABEL: fadd_f32__nnan_ninf_afn:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = sfadd(r0,r1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %result = fadd nnan ninf afn float %x, %y
+  ret float %result
+}