diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 1dfe070ff1..fe39be9801 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -57,7 +57,7 @@ steps: - label: "CUDA 11.2" plugins: - JuliaCI/julia#v1: - version: 1.6 + version: 1.6-nightly - JuliaCI/julia-test#v1: test_args: "--thorough" - JuliaCI/julia-coverage#v1: @@ -79,7 +79,7 @@ steps: - label: "CUDA 11.1" plugins: - JuliaCI/julia#v1: - version: 1.6 + version: 1.6-nightly - JuliaCI/julia-test#v1: test_args: "--thorough" - JuliaCI/julia-coverage#v1: @@ -101,7 +101,7 @@ steps: - label: "CUDA 11.0" plugins: - JuliaCI/julia#v1: - version: 1.6 + version: 1.6-nightly - JuliaCI/julia-test#v1: test_args: "--thorough" - JuliaCI/julia-coverage#v1: @@ -123,7 +123,7 @@ steps: - label: "CUDA 10.2" plugins: - JuliaCI/julia#v1: - version: 1.6 + version: 1.6-nightly - JuliaCI/julia-test#v1: test_args: "--thorough" - JuliaCI/julia-coverage#v1: @@ -145,7 +145,7 @@ steps: - label: "CUDA 10.1" plugins: - JuliaCI/julia#v1: - version: 1.6 + version: 1.6-nightly - JuliaCI/julia-test#v1: test_args: "--thorough" - JuliaCI/julia-coverage#v1: diff --git a/perf/kernel.jl b/perf/kernel.jl index e4fbf82769..a5ae6b8822 100644 --- a/perf/kernel.jl +++ b/perf/kernel.jl @@ -26,3 +26,10 @@ function checked_indexing_kernel(dest, src) return end group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src) + +function rand_kernel(dest::AbstractArray{T}) where {T} + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + dest[i] = rand(T) + return +end +group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest) diff --git a/src/CUDA.jl b/src/CUDA.jl index fab870d839..0288600332 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -57,6 +57,7 @@ include("device/intrinsics.jl") include("device/llvm.jl") include("device/runtime.jl") include("device/texture.jl") +include("device/random.jl") # array essentials include("pool.jl") diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 4074aa77ac..a9b6fae4b6 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -200,15 +200,13 @@ end ## host-side kernels -mutable struct HostKernel{F,TT} <: AbstractKernel{F,TT} +struct HostKernel{F,TT} <: AbstractKernel{F,TT} ctx::CuContext mod::CuModule fun::CuFunction - random_state::Union{Nothing,Missing,CuVector{UInt32}} - - function HostKernel{F,TT}(ctx::CuContext, mod::CuModule, fun::CuFunction, random_state) where {F,TT} - kernel = new{F,TT}(ctx, mod, fun, random_state) + function HostKernel{F,TT}(ctx::CuContext, mod::CuModule, fun::CuFunction) where {F,TT} + kernel = new{F,TT}(ctx, mod, fun) end end @@ -358,19 +356,17 @@ function cufunction_link(@nospecialize(job::CompilerJob), compiled) filter!(!isequal("exception_flag"), compiled.external_gvars) end - random_state = nothing - if "global_random_state" in compiled.external_gvars + # initialize random seeds, if used + if "global_random_seed" in compiled.external_gvars random_state = missing - filter!(!isequal("global_random_state"), compiled.external_gvars) + initialize_random_seeds!(mod) + filter!(!isequal("global_random_seed"), compiled.external_gvars) end - return HostKernel{job.source.f,job.source.tt}(ctx, mod, fun, random_state) + return HostKernel{job.source.f,job.source.tt}(ctx, mod, fun) end function (kernel::HostKernel)(args...; threads::CuDim=1, blocks::CuDim=1, kwargs...) - if kernel.random_state !== nothing - init_random_state!(kernel, prod(threads) * prod(blocks)) - end call(kernel, map(cudaconvert, args)...; threads, blocks, kwargs...) end diff --git a/src/device/intrinsics.jl b/src/device/intrinsics.jl index 211f9176cf..0e0814b571 100644 --- a/src/device/intrinsics.jl +++ b/src/device/intrinsics.jl @@ -41,7 +41,6 @@ include("intrinsics/memory_dynamic.jl") include("intrinsics/atomics.jl") include("intrinsics/misc.jl") include("intrinsics/wmma.jl") -include("intrinsics/random.jl") # functionality from libdevice # diff --git a/src/device/intrinsics/random.jl b/src/device/intrinsics/random.jl deleted file mode 100644 index 2bae5ada3d..0000000000 --- a/src/device/intrinsics/random.jl +++ /dev/null @@ -1,71 +0,0 @@ -## random number generation - -using Random -import RandomNumbers - - -# helpers - -global_index() = (threadIdx().x, threadIdx().y, threadIdx().z, - blockIdx().x, blockIdx().y, blockIdx().z) - - -# global state - -struct ThreadLocalXorshift32 <: RandomNumbers.AbstractRNG{UInt32} - vals::CuDeviceArray{UInt32, 6, AS.Generic} -end - -function init_random_state!(kernel, len) - if kernel.random_state === missing || length(kernel.random_state) < len - kernel.random_state = CuVector{UInt32}(undef, len) - end - - random_state_ptr = CuGlobal{Ptr{Cvoid}}(kernel.mod, "global_random_state") - random_state_ptr[] = reinterpret(Ptr{Cvoid}, pointer(kernel.random_state)) -end - -@eval @inline function global_random_state() - ptr = reinterpret(LLVMPtr{UInt32, AS.Generic}, Base.llvmcall( - $("""@global_random_state = weak externally_initialized global i$(WORD_SIZE) 0 - define i$(WORD_SIZE) @entry() #0 { - %ptr = load i$(WORD_SIZE), i$(WORD_SIZE)* @global_random_state, align 8 - ret i$(WORD_SIZE) %ptr - } - attributes #0 = { alwaysinline } - """, "entry"), Ptr{Cvoid}, Tuple{})) - dims = (blockDim().x, blockDim().y, blockDim().z, gridDim().x, gridDim().y, gridDim().z) - CuDeviceArray(dims, ptr) -end - -@device_override Random.default_rng() = ThreadLocalXorshift32(global_random_state()) - -@device_override Random.make_seed() = clock(UInt32) - -function Random.seed!(rng::ThreadLocalXorshift32, seed::Integer) - index = global_index() - rng.vals[index...] = seed % UInt32 - return -end - - -# generators - -function xorshift(x::UInt32)::UInt32 - x = xor(x, x << 13) - x = xor(x, x >> 17) - x = xor(x, x << 5) - return x -end - -function Random.rand(rng::ThreadLocalXorshift32, ::Type{UInt32}) - # NOTE: we add the current linear index to the local state, to make sure threads get - # different random numbers when unseeded (initial state = 0 for all threads) - index = global_index() - offset = LinearIndices(rng.vals)[index...] - state = rng.vals[index...] + UInt32(offset) - - new_state = xorshift(state) - rng.vals[index...] = new_state - return new_state -end diff --git a/src/device/random.jl b/src/device/random.jl new file mode 100644 index 0000000000..5a33fed979 --- /dev/null +++ b/src/device/random.jl @@ -0,0 +1,224 @@ +## random number generation + +using Random +import RandomNumbers + + +# global state + +# we provide 128 bytes of random state, shared across the thread block using shared memory. +# the same amount of seeding data is available in constant memory, set during initial load. + +@eval @inline function global_random_seed() + ptr = Base.llvmcall( + $("""@global_random_seed = weak addrspace($(AS.Constant)) externally_initialized global [32 x i32] zeroinitializer, align 32 + define i8 addrspace($(AS.Constant))* @entry() #0 { + %ptr = getelementptr inbounds [32 x i32], [32 x i32] addrspace($(AS.Constant))* @global_random_seed, i64 0, i64 0 + %untyped_ptr = bitcast i32 addrspace($(AS.Constant))* %ptr to i8 addrspace($(AS.Constant))* + ret i8 addrspace($(AS.Constant))* %untyped_ptr + } + attributes #0 = { alwaysinline } + """, "entry"), LLVMPtr{UInt32, AS.Constant}, Tuple{}) + CuDeviceArray((32,), ptr) +end + +@eval @inline function global_random_state() + ptr = Base.llvmcall( + $("""@global_random_state = weak addrspace($(AS.Shared)) global [32 x i32] zeroinitializer, align 32 + define i8 addrspace($(AS.Shared))* @entry() #0 { + %ptr = getelementptr inbounds [32 x i32], [32 x i32] addrspace($(AS.Shared))* @global_random_state, i64 0, i64 0 + %untyped_ptr = bitcast i32 addrspace($(AS.Shared))* %ptr to i8 addrspace($(AS.Shared))* + ret i8 addrspace($(AS.Shared))* %untyped_ptr + } + attributes #0 = { alwaysinline } + """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{}) + CuDeviceArray((32,), ptr) +end + +function initialize_random_seeds!(mod) + seed = CuGlobal{NTuple{32,UInt32}}(mod, "global_random_seed") + seed[async=true] = Tuple(rand(UInt32, 32)) +end + +@device_override Random.make_seed() = clock(UInt32) + +@generated function make_full_seed(seed::Integer) + quote + s_0 = seed % UInt32 + Base.@nexprs 32 i->s_i = xorshift(s_{i-1}) + Base.@ncall 32 tuple s + end +end + + +# helpers + +function xorshift(x::UInt32)::UInt32 + x = xor(x, x << 13) + x = xor(x, x >> 17) + x = xor(x, x << 5) + return x +end + + +# generators + +""" + SharedTauswortheGenerator() + +A maximally equidistributed combined Tausworthe generator. + +The generator uses 32 bytes of random state stored in shared memory. This memory is +zero-initialized; When the first random number is generated, each block will derive an +initial state from the seed provided during module compilation, and the block identifier. +Each block will then indepentendly, but deterministically use that state to generate random +numbers. Within a block, the 32-bytes of random state are identical, so the warp identifier +will be mixed in to ensure uniqueness across warps. Finally, the first warp of each block +updates the shared state. + +!!! warning + + Although the numbers obtained from this generator "look OK", they do not pass the + SmallCrush test suite, so the generator should be deemed experimental. +""" +struct SharedTauswortheGenerator <: RandomNumbers.AbstractRNG{UInt32} +end + +@inline function Base.getproperty(rng::SharedTauswortheGenerator, field::Symbol) + if field === :seed + global_random_seed() + elseif field === :state + global_random_state() + end +end + +@device_override Random.default_rng() = SharedTauswortheGenerator() + +function Random.seed!(rng::SharedTauswortheGenerator, seed::Integer) + Random.seed!(rng, make_full_seed(seed)) + return +end + +""" + Random.seed!(rng::SharedTauswortheGenerator, seeds) + +Seed the on-device Tausworthe generator with a 32-element tuple or array of UInt32 seeds. +This function only needs to be called by a single warp (i.e. 32 threads) from each block, +but it does not hurt to call it from all threads (as long as threads are synchronized after +that). It should always be called by at least 32 threads to ensure the random state is fully +initialized, even if you will be using the generator from fewer threads! +""" +@inline Base.@propagate_inbounds function Random.seed!(rng::SharedTauswortheGenerator, seed) + state = initial_state(seed) + @inbounds rng.state[laneid()] = state + return +end + +@inline Base.@propagate_inbounds function initial_state(seeds) + z = seeds[laneid()] + + # mix-in the block id to ensure unique values across blocks + # XXX: is this OK? shouldn't we use a generator that allows skipping ahead? + # https://stackoverflow.com/questions/11692785/efficient-xorshift-skip-ahead + blockId = blockIdx().x + (blockIdx().y - 1) * gridDim().x + + (blockIdx().z - 1) * gridDim().x * gridDim().y + z = xorshift(z ⊻ blockId%UInt32) + + return z +end + +# NOTE: @propagate_inbounds for when we know the passed seed contains at least 32 elements +# XXX: without @inline @propagate_inbounds, we get local memory accesses (e.g. with `rand!`) + +#const TausShift1 = CuConstantMemory{UInt32}((6, 2, 13, 3)) +#const TausShift2 = CuConstantMemory{UInt32}((13, 27, 21, 12)) +#const TausShift3 = CuConstantMemory{UInt32}((18, 2, 7, 13)) +#const TausOffset = CuConstantMemory{UInt32}((4294967294, 4294967288, 4294967280, 4294967168)) +# XXX: constant memory isn't supported yet, so let's resort to llvmcall +for (name, vals) in ("TausShift1" => (6, 2, 13, 3), + "TausShift2" => (13, 27, 21, 12), + "TausShift3" => (18, 2, 7, 13), + "TausOffset" => (4294967294, 4294967288, 4294967280, 4294967168)) + @eval @inline function $(Symbol(name))() + ptr = Base.llvmcall( + $("""@$(name) = weak addrspace($(AS.Constant)) global [4 x i32] [i32 $(vals[1]), i32 $(vals[2]), i32 $(vals[3]), i32 $(vals[4])], align 4 + define i8 addrspace($(AS.Constant))* @entry() #0 { + %ptr = getelementptr inbounds [4 x i32], [4 x i32] addrspace($(AS.Constant))* @$(name), i64 0, i64 0 + %untyped_ptr = bitcast i32 addrspace($(AS.Constant))* %ptr to i8 addrspace($(AS.Constant))* + ret i8 addrspace($(AS.Constant))* %untyped_ptr + } + attributes #0 = { alwaysinline } + """, "entry"), LLVMPtr{UInt32, AS.Constant}, Tuple{}) + CuDeviceArray((4,), ptr) + end +end + +function TausStep(z::Unsigned, S1::Integer, S2::Integer, S3::Integer, M::Unsigned) + b = (((z << S1) ⊻ z) >> S2) + return (((z & M) << S3) ⊻ b) +end + +""" + Random.rand(rng::SharedTauswortheGenerator, UInt32) + +Generate a byte of random data using the on-device Tausworthe generator. + +!!! warning + + This functions performs synchronization, so should only be called uniformly (i.e. by all + threads in a block). Do not call it from a branch that diverges within a block, or your + kernel may deadlock. +""" +function Random.rand(rng::SharedTauswortheGenerator, ::Type{UInt32}) + @inline pow2_mod1(x, y) = (x-1)&(y-1) + 1 + + threadId = UInt32(threadIdx().x + (threadIdx().y - 1) * blockDim().x + + (threadIdx().z - 1) * blockDim().x * blockDim().y) + warpId = (threadId-UInt32(1)) >> 5 + UInt32(1) # fld1 + i = pow2_mod1(threadId, 32) + j = pow2_mod1(threadId, 4) + + @inbounds begin + # get state + z = rng.state[i] + if z == 0 + z = initial_state(rng.seed) + end + + # mix-in the warp id to ensure unique values across blocks. + # we have max 1024 threads per block, so can safely shift by 16 bits. + # XXX: see comment in `initial_state` + z = xorshift(z ⊻ (warpId << 16)) + + sync_threads() + + # advance & update state + S1, S2, S3, M = TausShift1()[j], TausShift2()[j], TausShift3()[j], TausOffset()[j] + state = TausStep(z, S1, S2, S3, M) + if warpId == 1 + rng.state[i] = state + end + + sync_threads() + + # generate based on 4 bytes of state + mask = active_mask() + i1 = pow2_mod1(threadId+1, 32) + i2 = pow2_mod1(threadId+2, 32) + i3 = pow2_mod1(threadId+3, 32) + if active_mask() == typemax(UInt32) + # we have a full warp, so we can safely shuffle. + # get the warp-local states from neighbouring threads. + s1 = shfl_sync(FULL_MASK, state, i1) + s2 = shfl_sync(FULL_MASK, state, i2) + s3 = shfl_sync(FULL_MASK, state, i3) + else + # can't shuffle, so fall back to fetching block-global state. + # this results is less entropy, as it reuses data from the first warp. + s1 = rng.state[i1] + s2 = rng.state[i2] + s3 = rng.state[i3] + end + state ⊻ s1 ⊻ s2 ⊻ s3 + end +end diff --git a/src/random.jl b/src/random.jl index 4afbd98489..07416c5d09 100644 --- a/src/random.jl +++ b/src/random.jl @@ -4,6 +4,114 @@ using Random export rand_logn!, rand_poisson! + +# native RNG + +""" + CUDA.RNG() + +A random number generator using `rand()` in a device kernel. + +!!! warning + + This generator is experimental, and not used by default yet. Currently, it does not + pass SmallCrush (but neither does the current GPUArrays.jl-based generator). If you + can, use a CURAND-based generator for now. + +See also: [SharedTauswortheGenerator](@ref) +""" +struct RNG <: AbstractRNG + state::CuVector{UInt32} + + function RNG(seed) + @assert length(seed) == 32 + new(seed) + end +end + +RNG() = RNG(rand(UInt32, 32)) + +function Random.seed!(rng::RNG, seed::AbstractVector{UInt32}) + @assert length(seed) == 32 + copyto!(rng.state, seed) +end + +function Random.seed!(rng::RNG) + Random.rand!(rng.state) + return +end + +function Random.rand!(rng::RNG, A::AnyCuArray) + function kernel(A::AbstractArray{T}, state::AbstractVector{UInt32}) where {T} + device_rng = Random.default_rng() + + # initialize the state + tid = threadIdx().x + if tid <= 32 + # we know for sure the seed will contain 32 values + @inbounds Random.seed!(device_rng, state) + end + + sync_threads() + + # grid-stride loop + offset = (blockIdx().x - 1) * blockDim().x + while offset < length(A) + # generating random numbers synchronizes threads, so needs to happen uniformly. + val = Random.rand(device_rng, T) + + i = tid + offset + if i <= length(A) + @inbounds A[i] = val + end + + offset += (blockDim().x - 1) * gridDim().x + end + + sync_threads() + + # save the device rng state of the first block (other blocks are derived from it) + # so that subsequent launches generate different random numbers + if blockIdx().x == 1 && tid <= 32 + @inbounds state[tid] = device_rng.state[tid] + end + + return + end + + kernel = @cuda launch=false name="rand!" kernel(A, rng.state) + config = launch_configuration(kernel.fun; max_threads=64) + threads = max(32, min(config.threads, length(A))) + blocks = min(config.blocks, cld(length(A), threads)) + kernel(A, rng.state; threads=threads, blocks=blocks) + + # XXX: updating the state from within the kernel is racey, + # so we should have a RNG per stream + + A +end + +# TODO: `randn!`; cannot reuse from Base or RandomNumbers, as those do scalar indexing + + +# generic functionality + +function Random.rand!(rng::Union{RNG,CURAND.RNG,GPUArrays.RNG}, A::AbstractArray{T}) where {T} + B = CuArray{T}(undef, size(A)) + Random.rand!(rng, B) + copyto!(A, B) +end + +function Random.rand(rng::Union{RNG,CURAND.RNG,GPUArrays.RNG}, T::Type) + assertscalar("scalar rand") + A = CuArray{T}(undef, 1) + Random.rand!(rng, A) + A[] +end + + +# RNG-less interface + # the interface is split in two levels: # - functions that extend the Random standard library, and take an RNG as first argument, # will only ever dispatch to CURAND and as a result are limited in the types they support. diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl index 1337898b6c..a4b336a9fd 100644 --- a/test/device/intrinsics.jl +++ b/test/device/intrinsics.jl @@ -1184,52 +1184,3 @@ end end end - - - -############################################################################################ - -@testset "random numbers" begin - -n = 256 - -@testset "basic" begin - function kernel(A::CuDeviceArray{T}, B::CuDeviceArray{T}) where {T} - tid = threadIdx().x - A[tid] = rand(T) - B[tid] = rand(T) - return nothing - end - - @testset for T in (Int32, UInt32, Int64, UInt64, Int128, UInt128, - Float32, Float64) - a = CUDA.zeros(T, n) - b = CUDA.zeros(T, n) - - @cuda threads=n kernel(a, b) - - @test all(Array(a) .!= Array(b)) - end -end - -@testset "custom seed" begin - function kernel(A::CuDeviceArray{T}) where {T} - tid = threadIdx().x - Random.seed!(1234) - A[tid] = rand(T) - return nothing - end - - @testset for T in (Int32, UInt32, Int64, UInt64, Int128, UInt128, - Float32, Float64) - a = CUDA.zeros(T, n) - b = CUDA.zeros(T, n) - - @cuda threads=n kernel(a) - @cuda threads=n kernel(b) - - @test Array(a) == Array(b) - end -end - -end diff --git a/test/device/random.jl b/test/device/random.jl new file mode 100644 index 0000000000..b026c5cba7 --- /dev/null +++ b/test/device/random.jl @@ -0,0 +1,81 @@ +using Random + +n = 256 + +@testset "basic rand($T), seed $seed" for T in (Int32, UInt32, Int64, UInt64, Int128, UInt128, + Float32, Float64), + seed in (nothing, #=missing,=# 1234) + + function apply_seed(seed) + if seed === missing + # should result in different numbers across launches + Random.seed!() + # XXX: this currently doesn't work, because of the definition in Base, + # `seed!(r::MersenneTwister=default_rng())`, which breaks overriding + # `default_rng` with a non-MersenneTwister RNG. + elseif seed !== nothing + # should result in the same numbers + Random.seed!(seed) + elseif seed === nothing + # should result in different numbers across launches, + # as determined by the seed set during module loading. + end + end + + # different kernel invocations should get different numbers + @testset "across launches" begin + function kernel(A::AbstractArray{T}, seed) where {T} + apply_seed(seed) + tid = threadIdx().x + A[tid] = rand(T) + return nothing + end + + a = CUDA.zeros(T, n) + b = CUDA.zeros(T, n) + + @cuda threads=n kernel(a, seed) + @cuda threads=n kernel(b, seed) + + if seed === nothing || seed === missing + @test all(Array(a) .!= Array(b)) + else + @test Array(a) == Array(b) + end + end + + # multiple calls to rand should get different numbers + @testset "across calls" begin + function kernel(A::AbstractArray{T}, B::AbstractArray{T}, seed) where {T} + apply_seed(seed) + tid = threadIdx().x + A[tid] = rand(T) + B[tid] = rand(T) + return nothing + end + + a = CUDA.zeros(T, n) + b = CUDA.zeros(T, n) + + @cuda threads=n kernel(a, b, seed) + + @test all(Array(a) .!= Array(b)) + end + + # different threads should get different numbers + @testset "across threads" for active_dim in 1:6 + function kernel(A::AbstractArray{T}, seed) where {T} + apply_seed(seed) + id = threadIdx().x*threadIdx().y*threadIdx().z*blockIdx().x*blockIdx().y*blockIdx().z + A[id] = rand(T) + return nothing + end + + tx, ty, tz, bx, by, bz = [dim == active_dim ? 2 : 1 for dim in 1:6] + a = CUDA.zeros(T, 2) + + @cuda threads=(tx, ty, tz) blocks=(bx, by, bz) kernel(a, seed) + + @test Array(a)[1] != Array(a)[2] + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 89f016908c..94b4ce3312 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -171,6 +171,9 @@ if Sys.ARCH == :aarch64 # CUFFT segfaults on ARM push!(skip_tests, "cufft") end +if VERSION < v"1.6.1-" + push!(skip_tests, "device/random") +end for (i, test) in enumerate(skip_tests) # we find tests by scanning the file system, so make sure the path separator matches skip_tests[i] = replace(test, '/'=>Base.Filesystem.path_separator)