Restore functionality of FastMath.sincos. (#1627)

maleadt · web-flow · commit 0cd30cbed3d0 · 2022-10-12T16:25:16.000+02:00
Base decided in JuliaLang/julia#24031 that FastMath.sincos should fall back to the native implementation in Julia, because it is faster than the intrinsics (for the CPU at least). That does not hold for CUDA GPUs, so have it again call sin_fast/cos_fast.
diff --git a/src/device/intrinsics/math.jl b/src/device/intrinsics/math.jl
@@ -40,6 +40,9 @@ end
     ccall("extern __nv_sincosf", llvmcall, Cvoid, (Cfloat, Ptr{Cfloat}, Ptr{Cfloat}), x, s, c)
     return (s[], c[])
 end
+# Base makes sincos_fast fall back to its native implementation (JuliaLang/julia#24031), which
+# is presumed faster on the CPU; on CUDA GPUs the intrinsics win, so use sin_fast/cos_fast
+@device_override FastMath.sincos_fast(x::Union{Float64,Float32}) = (FastMath.sin_fast(x), FastMath.cos_fast(x))
 
 @device_override function Base.sincospi(x::Float64)
     s = Ref{Cdouble}()
diff --git a/test/device/intrinsics/math.jl b/test/device/intrinsics/math.jl
@@ -136,4 +136,16 @@ using SpecialFunctions
             @test Array(a)[3] == r
         end
     end
+
+    @testset "@fastmath sincos" begin
+        # regression test for JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos
+        function kernel(a, b, c)
+            @inbounds b[], c[] = @fastmath sincos(a[])
+            return
+        end
+        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
+        @test contains(asm, "sin.approx.f32")  # @test (not @assert) so the testset records it
+        @test contains(asm, "cos.approx.f32")
+        @test !contains(asm, "__nv")  # from libdevice
+    end
 end