Skip to content

Commit 03b4c39

Browse files
authored
Merge pull request #788 from JuliaGPU/tb/speedup_rand
Speed-up rand: Tausworthe RNG with shared random state.
2 parents 4721f60 + a3792e6 commit 03b4c39

File tree

11 files changed: +437 additions, -138 deletions (lines changed)

.buildkite/pipeline.yml

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ steps:
5757
- label: "CUDA 11.2"
5858
plugins:
5959
- JuliaCI/julia#v1:
60-
version: 1.6
60+
version: 1.6-nightly
6161
- JuliaCI/julia-test#v1:
6262
test_args: "--thorough"
6363
- JuliaCI/julia-coverage#v1:
@@ -79,7 +79,7 @@ steps:
7979
- label: "CUDA 11.1"
8080
plugins:
8181
- JuliaCI/julia#v1:
82-
version: 1.6
82+
version: 1.6-nightly
8383
- JuliaCI/julia-test#v1:
8484
test_args: "--thorough"
8585
- JuliaCI/julia-coverage#v1:
@@ -101,7 +101,7 @@ steps:
101101
- label: "CUDA 11.0"
102102
plugins:
103103
- JuliaCI/julia#v1:
104-
version: 1.6
104+
version: 1.6-nightly
105105
- JuliaCI/julia-test#v1:
106106
test_args: "--thorough"
107107
- JuliaCI/julia-coverage#v1:
@@ -123,7 +123,7 @@ steps:
123123
- label: "CUDA 10.2"
124124
plugins:
125125
- JuliaCI/julia#v1:
126-
version: 1.6
126+
version: 1.6-nightly
127127
- JuliaCI/julia-test#v1:
128128
test_args: "--thorough"
129129
- JuliaCI/julia-coverage#v1:
@@ -145,7 +145,7 @@ steps:
145145
- label: "CUDA 10.1"
146146
plugins:
147147
- JuliaCI/julia#v1:
148-
version: 1.6
148+
version: 1.6-nightly
149149
- JuliaCI/julia-test#v1:
150150
test_args: "--thorough"
151151
- JuliaCI/julia-coverage#v1:

perf/kernel.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,10 @@ function checked_indexing_kernel(dest, src)
2626
return
2727
end
2828
group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
29+
30+
function rand_kernel(dest::AbstractArray{T}) where {T}
31+
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
32+
dest[i] = rand(T)
33+
return
34+
end
35+
group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)

src/CUDA.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ include("device/intrinsics.jl")
5757
include("device/llvm.jl")
5858
include("device/runtime.jl")
5959
include("device/texture.jl")
60+
include("device/random.jl")
6061

6162
# array essentials
6263
include("pool.jl")

src/compiler/execution.jl

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -200,15 +200,13 @@ end
200200

201201
## host-side kernels
202202

203-
mutable struct HostKernel{F,TT} <: AbstractKernel{F,TT}
203+
struct HostKernel{F,TT} <: AbstractKernel{F,TT}
204204
ctx::CuContext
205205
mod::CuModule
206206
fun::CuFunction
207207

208-
random_state::Union{Nothing,Missing,CuVector{UInt32}}
209-
210-
function HostKernel{F,TT}(ctx::CuContext, mod::CuModule, fun::CuFunction, random_state) where {F,TT}
211-
kernel = new{F,TT}(ctx, mod, fun, random_state)
208+
function HostKernel{F,TT}(ctx::CuContext, mod::CuModule, fun::CuFunction) where {F,TT}
209+
kernel = new{F,TT}(ctx, mod, fun)
212210
end
213211
end
214212

@@ -358,19 +356,17 @@ function cufunction_link(@nospecialize(job::CompilerJob), compiled)
358356
filter!(!isequal("exception_flag"), compiled.external_gvars)
359357
end
360358

361-
random_state = nothing
362-
if "global_random_state" in compiled.external_gvars
359+
# initialize random seeds, if used
360+
if "global_random_seed" in compiled.external_gvars
363361
random_state = missing
364-
filter!(!isequal("global_random_state"), compiled.external_gvars)
362+
initialize_random_seeds!(mod)
363+
filter!(!isequal("global_random_seed"), compiled.external_gvars)
365364
end
366365

367-
return HostKernel{job.source.f,job.source.tt}(ctx, mod, fun, random_state)
366+
return HostKernel{job.source.f,job.source.tt}(ctx, mod, fun)
368367
end
369368

370369
function (kernel::HostKernel)(args...; threads::CuDim=1, blocks::CuDim=1, kwargs...)
371-
if kernel.random_state !== nothing
372-
init_random_state!(kernel, prod(threads) * prod(blocks))
373-
end
374370
call(kernel, map(cudaconvert, args)...; threads, blocks, kwargs...)
375371
end
376372

src/device/intrinsics.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ include("intrinsics/memory_dynamic.jl")
4141
include("intrinsics/atomics.jl")
4242
include("intrinsics/misc.jl")
4343
include("intrinsics/wmma.jl")
44-
include("intrinsics/random.jl")
4544

4645
# functionality from libdevice
4746
#

src/device/intrinsics/random.jl

Lines changed: 0 additions & 71 deletions
This file was deleted.

0 commit comments

Comments
 (0)