Skip to content

Commit 0cd30cb

Browse files
authored
Restore functionality of FastMath.sincos. (#1627)
Base decided in JuliaLang/julia#24031 that FastMath.sincos should fall back to the native implementation in Julia, because it is faster than the intrinsics (for the CPU at least). That does not hold for CUDA GPUs, so have it again call sin_fast/cos_fast.
1 parent 8a4cbde commit 0cd30cb

File tree

2 files changed

+15
-0
lines changed

2 files changed

+15
-0
lines changed

src/device/intrinsics/math.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ end
4040
ccall("extern __nv_sincosf", llvmcall, Cvoid, (Cfloat, Ptr{Cfloat}, Ptr{Cfloat}), x, s, c)
4141
return (s[], c[])
4242
end
43+
# Base has sincos_fast fall back to the native implementation which is presumed faster,
44+
# but that is not the case compared to CUDA's intrinsics
45+
@device_override function FastMath.sincos_fast(x::Union{Float64,Float32})
    # Build the (sin, cos) pair from the individual fast-math variants.
    return (FastMath.sin_fast(x), FastMath.cos_fast(x))
end
4346

4447
@device_override function Base.sincospi(x::Float64)
4548
s = Ref{Cdouble}()

test/device/intrinsics/math.jl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,4 +136,16 @@ using SpecialFunctions
136136
@test Array(a)[3] == r
137137
end
138138
end
139+
140+
@testset "@fastmath sincos" begin
    # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos
    function kernel(a, b, c)
        @inbounds b[], c[] = @fastmath sincos(a[])
        return
    end

    # Render the PTX generated for `kernel` into a string so it can be inspected.
    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))

    # Use @test rather than @assert: a failing @assert throws and aborts the testset
    # instead of being recorded as a test failure, and assertions may be disabled.
    @test contains(asm, "sin.approx.f32")
    @test contains(asm, "cos.approx.f32")
    @test !contains(asm, "__nv")    # from libdevice
end
139151
end

0 commit comments

Comments
 (0)