[SYCL][libclc][CUDA] Add --ffast-math support (#5801)

pgorlani · web-flow · commit 0f0c5d19c52b · 2022-04-25T08:02:31.000-07:00
This patch allows the `-ffast-math` compiler
flag to substitute the regular `genfloatf` math
built-ins with their `::native` versions.

Moreover, this patch completes the support for native
built-ins in `libclc/ptx-nvidiacl` by connecting
them with the `__nv_fast` functions present in
libdevice. If a fast function is not available in
libdevice, the corresponding `nvvm` intrinsic is used.
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -347,6 +347,8 @@ BUILTIN(__nvvm_rcp_rm_ftz_f, "ff", "")
 BUILTIN(__nvvm_rcp_rm_f, "ff", "")
 BUILTIN(__nvvm_rcp_rp_ftz_f, "ff", "")
 BUILTIN(__nvvm_rcp_rp_f, "ff", "")
+BUILTIN(__nvvm_rcp_approx_f, "ff", "")
+BUILTIN(__nvvm_rcp_approx_ftz_f, "ff", "")
 
 BUILTIN(__nvvm_rcp_rn_d, "dd", "")
 BUILTIN(__nvvm_rcp_rz_d, "dd", "")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
@@ -1546,7 +1546,7 @@ def ffp_exception_behavior_EQ : Joined<["-"], "ffp-exception-behavior=">, Group<
   MarshallingInfoEnum<LangOpts<"FPExceptionMode">, "FPE_Ignore">;
 defm fast_math : BoolFOption<"fast-math",
   LangOpts<"FastMath">, DefaultFalse,
-  PosFlag<SetTrue, [CC1Option], "Allow aggressive, lossy floating-point optimizations",
+  PosFlag<SetTrue, [CC1Option, CoreOption], "Allow aggressive, lossy floating-point optimizations",
           [cl_fast_relaxed_math.KeyPath]>,
   NegFlag<SetFalse>>;
 def menable_unsafe_fp_math : Flag<["-"], "menable-unsafe-fp-math">, Flags<[CC1Option]>,
diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES
@@ -43,13 +43,15 @@ math/log2.cl
 math/logb.cl
 math/modf.cl
 math/native_cos.cl
+math/native_divide.cl
 math/native_exp.cl
 math/native_exp10.cl
 math/native_exp2.cl
 math/native_log.cl
 math/native_log10.cl
 math/native_log2.cl
 math/native_powr.cl
+math/native_recip.cl
 math/native_rsqrt.cl
 math/native_sin.cl
 math/native_sqrt.cl
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_divide.cl b/libclc/ptx-nvidiacl/libspirv/math/native_divide.cl
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <spirv/spirv.h>
+
+#include "../../include/libdevice.h"
+#include <clcmacro.h>
+
+#define __CLC_FUNCTION __spirv_ocl_native_divide
+#define __CLC_BUILTIN __nv_fast_fdivide
+#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
+#define __FLOAT_ONLY
+#include <math/binary_builtin.inc>
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl b/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl
@@ -8,12 +8,17 @@
 
 #include <spirv/spirv.h>
 
-#include "../../include/libdevice.h"
 #include <clcmacro.h>
 
-#define __CLC_FUNCTION __spirv_ocl_native_exp2
-#define __CLC_BUILTIN __nv_exp2
-#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
+extern int __clc_nvvm_reflect_ftz();
+
+_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_native_exp2(float x) {
+  return (__clc_nvvm_reflect_ftz()) ? __nvvm_ex2_approx_ftz_f(x)
+                                    : __nvvm_ex2_approx_f(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_native_exp2,
+                     float)
 
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
@@ -39,9 +44,3 @@ _CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __clc_native_exp2,
 #undef __USE_HALF_EXP2_APPROX
 
 #endif // cl_khr_fp16
-
-// Undef halfs before uncluding unary builtins, as they are handled above.
-#ifdef cl_khr_fp16
-#undef cl_khr_fp16
-#endif // cl_khr_fp16
-#include <math/unary_builtin.inc>
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_recip.cl b/libclc/ptx-nvidiacl/libspirv/math/native_recip.cl
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <spirv/spirv.h>
+
+#include <clcmacro.h>
+
+extern int __clc_nvvm_reflect_ftz();
+
+_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_native_recip(float x) {
+  return (__clc_nvvm_reflect_ftz()) ? __nvvm_rcp_approx_ftz_f(x)
+                                    : __nvvm_rcp_approx_f(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_native_recip,
+                     float)
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_rsqrt.cl b/libclc/ptx-nvidiacl/libspirv/math/native_rsqrt.cl
@@ -8,10 +8,14 @@
 
 #include <spirv/spirv.h>
 
-#include "../../include/libdevice.h"
 #include <clcmacro.h>
 
-#define __CLC_FUNCTION __spirv_ocl_native_rsqrt
-#define __CLC_BUILTIN __nv_rsqrt
-#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
-#include <math/unary_builtin.inc>
+extern int __clc_nvvm_reflect_ftz();
+
+_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_native_rsqrt(float x) {
+  return (__clc_nvvm_reflect_ftz()) ? __nvvm_rsqrt_approx_ftz_f(x)
+                                    : __nvvm_rsqrt_approx_f(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_native_rsqrt,
+                     float)
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_sin.cl b/libclc/ptx-nvidiacl/libspirv/math/native_sin.cl
@@ -12,6 +12,7 @@
 #include <clcmacro.h>
 
 #define __CLC_FUNCTION __spirv_ocl_native_sin
-#define __CLC_BUILTIN __nv_sin
+#define __CLC_BUILTIN __nv_fast_sin
 #define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
+#define __FLOAT_ONLY
 #include <math/unary_builtin.inc>
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_sqrt.cl b/libclc/ptx-nvidiacl/libspirv/math/native_sqrt.cl
@@ -8,10 +8,14 @@
 
 #include <spirv/spirv.h>
 
-#include "../../include/libdevice.h"
 #include <clcmacro.h>
 
-#define __CLC_FUNCTION __spirv_ocl_native_sqrt
-#define __CLC_BUILTIN __nv_sqrt
-#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
-#include <math/unary_builtin.inc>
+extern int __clc_nvvm_reflect_ftz();
+
+_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_native_sqrt(float x) {
+  return (__clc_nvvm_reflect_ftz()) ? __nvvm_sqrt_approx_ftz_f(x)
+                                    : __nvvm_sqrt_approx_f(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_native_sqrt,
+                     float)
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_tan.cl b/libclc/ptx-nvidiacl/libspirv/math/native_tan.cl
@@ -12,6 +12,7 @@
 #include <clcmacro.h>
 
 #define __CLC_FUNCTION __spirv_ocl_native_tan
-#define __CLC_BUILTIN __nv_tan
+#define __CLC_BUILTIN __nv_fast_tan
 #define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
+#define __FLOAT_ONLY
 #include <math/unary_builtin.inc>
diff --git a/libclc/ptx-nvidiacl/libspirv/reflect.ll b/libclc/ptx-nvidiacl/libspirv/reflect.ll
@@ -6,3 +6,10 @@ define i32 @__clc_nvvm_reflect_arch() alwaysinline {
   %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([12 x i8], [12 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
   ret i32 %reflect
 }
+
+@str_ftz = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
+
+define i32 @__clc_nvvm_reflect_ftz() alwaysinline {
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @str_ftz, i32 0, i32 0) to i8*))
+  ret i32 %reflect
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -935,6 +935,11 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_rcp_rp_f : GCCBuiltin<"__nvvm_rcp_rp_f">,
       DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
 
+  def int_nvvm_rcp_approx_f : GCCBuiltin<"__nvvm_rcp_approx_f">,
+      DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_nvvm_rcp_approx_ftz_f : GCCBuiltin<"__nvvm_rcp_approx_ftz_f">,
+      DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
   def int_nvvm_rcp_rn_d : GCCBuiltin<"__nvvm_rcp_rn_d">,
       DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
   def int_nvvm_rcp_rz_d : GCCBuiltin<"__nvvm_rcp_rz_d">,
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1036,6 +1036,11 @@ def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
 def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
   Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>;
 
+def INT_NVVM_RCP_APPROX_F : F_MATH_1<"rcp.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_approx_f>;
+def INT_NVVM_RCP_APPROX_FTZ_F : F_MATH_1<"rcp.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_approx_ftz_f>;
+
 def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs,
   Float64Regs, int_nvvm_rcp_rn_d>;
 def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs,
diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp
@@ -25,6 +25,13 @@ namespace sycl {
 namespace __sycl_std = __host_std;
 #endif
 
+#ifdef __FAST_MATH__
+#define __FAST_MATH_GENFLOAT(T)                                                \
+  (detail::is_genfloatd<T>::value || detail::is_genfloath<T>::value)
+#else
+#define __FAST_MATH_GENFLOAT(T) (detail::is_genfloat<T>::value)
+#endif
+
 /* ----------------- 4.13.3 Math functions. ---------------------------------*/
 // genfloat acos (genfloat x)
 template <typename T>
@@ -114,7 +121,7 @@ detail::enable_if_t<detail::is_genfloat<T>::value, T> copysign(T x,
 
 // genfloat cos (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> cos(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> cos(T x) __NOEXC {
   return __sycl_std::__invoke_cos<T>(x);
 }
 
@@ -144,19 +151,19 @@ detail::enable_if_t<detail::is_genfloat<T>::value, T> erf(T x) __NOEXC {
 
 // genfloat exp (genfloat x )
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> exp(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> exp(T x) __NOEXC {
   return __sycl_std::__invoke_exp<T>(x);
 }
 
 // genfloat exp2 (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> exp2(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> exp2(T x) __NOEXC {
   return __sycl_std::__invoke_exp2<T>(x);
 }
 
 // genfloat exp10 (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> exp10(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> exp10(T x) __NOEXC {
   return __sycl_std::__invoke_exp10<T>(x);
 }
 
@@ -296,19 +303,19 @@ lgamma_r(T x, T2 signp) __NOEXC {
 
 // genfloat log (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> log(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> log(T x) __NOEXC {
   return __sycl_std::__invoke_log<T>(x);
 }
 
 // genfloat log2 (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> log2(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> log2(T x) __NOEXC {
   return __sycl_std::__invoke_log2<T>(x);
 }
 
 // genfloat log10 (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> log10(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> log10(T x) __NOEXC {
   return __sycl_std::__invoke_log10<T>(x);
 }
 
@@ -383,7 +390,7 @@ pown(T x, T2 y) __NOEXC {
 
 // genfloat powr (genfloat x, genfloat y)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> powr(T x, T y) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> powr(T x, T y) __NOEXC {
   return __sycl_std::__invoke_powr<T>(x, y);
 }
 
@@ -426,13 +433,13 @@ detail::enable_if_t<detail::is_genfloat<T>::value, T> round(T x) __NOEXC {
 
 // genfloat rsqrt (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> rsqrt(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> rsqrt(T x) __NOEXC {
   return __sycl_std::__invoke_rsqrt<T>(x);
 }
 
 // genfloat sin (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> sin(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sin(T x) __NOEXC {
   return __sycl_std::__invoke_sin<T>(x);
 }
 
@@ -459,13 +466,13 @@ detail::enable_if_t<detail::is_genfloat<T>::value, T> sinpi(T x) __NOEXC {
 
 // genfloat sqrt (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> sqrt(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sqrt(T x) __NOEXC {
   return __sycl_std::__invoke_sqrt<T>(x);
 }
 
 // genfloat tan (genfloat x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> tan(T x) __NOEXC {
+detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> tan(T x) __NOEXC {
   return __sycl_std::__invoke_tan<T>(x);
 }
 
@@ -1561,6 +1568,82 @@ detail::enable_if_t<detail::is_genfloatf<T>::value, T> tan(T x) __NOEXC {
 }
 
 } // namespace half_precision
+
+#ifdef __FAST_MATH__
+/* ----------------- -ffast-math functions (genfloatf only). ----------------*/
+// genfloatf cos (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> cos(T x) __NOEXC {
+  return native::cos(x);
+}
+
+// genfloatf exp (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp(T x) __NOEXC {
+  return native::exp(x);
+}
+
+// genfloatf exp2 (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp2(T x) __NOEXC {
+  return native::exp2(x);
+}
+
+// genfloatf exp10 (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp10(T x) __NOEXC {
+  return native::exp10(x);
+}
+
+// genfloatf log (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> log(T x) __NOEXC {
+  return native::log(x);
+}
+
+// genfloatf log2 (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> log2(T x) __NOEXC {
+  return native::log2(x);
+}
+
+// genfloatf log10 (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> log10(T x) __NOEXC {
+  return native::log10(x);
+}
+
+// genfloatf powr (genfloatf x, genfloatf y)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> powr(T x, T y) __NOEXC {
+  return native::powr(x, y);
+}
+
+// genfloatf rsqrt (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> rsqrt(T x) __NOEXC {
+  return native::rsqrt(x);
+}
+
+// genfloatf sin (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> sin(T x) __NOEXC {
+  return native::sin(x);
+}
+
+// genfloatf sqrt (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> sqrt(T x) __NOEXC {
+  return native::sqrt(x);
+}
+
+// genfloatf tan (genfloatf x)
+template <typename T>
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> tan(T x) __NOEXC {
+  return native::tan(x);
+}
+
+#endif // __FAST_MATH__
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)