diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 1dfe070ff1..fe39be9801 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -57,7 +57,7 @@ steps:
   - label: "CUDA 11.2"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.6
+          version: 1.6-nightly
       - JuliaCI/julia-test#v1:
           test_args: "--thorough"
       - JuliaCI/julia-coverage#v1:
@@ -79,7 +79,7 @@ steps:
   - label: "CUDA 11.1"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.6
+          version: 1.6-nightly
       - JuliaCI/julia-test#v1:
           test_args: "--thorough"
       - JuliaCI/julia-coverage#v1:
@@ -101,7 +101,7 @@ steps:
   - label: "CUDA 11.0"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.6
+          version: 1.6-nightly
       - JuliaCI/julia-test#v1:
           test_args: "--thorough"
       - JuliaCI/julia-coverage#v1:
@@ -123,7 +123,7 @@ steps:
   - label: "CUDA 10.2"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.6
+          version: 1.6-nightly
       - JuliaCI/julia-test#v1:
           test_args: "--thorough"
       - JuliaCI/julia-coverage#v1:
@@ -145,7 +145,7 @@ steps:
   - label: "CUDA 10.1"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.6
+          version: 1.6-nightly
       - JuliaCI/julia-test#v1:
           test_args: "--thorough"
       - JuliaCI/julia-coverage#v1:
diff --git a/perf/kernel.jl b/perf/kernel.jl
index e4fbf82769..a5ae6b8822 100644
--- a/perf/kernel.jl
+++ b/perf/kernel.jl
@@ -26,3 +26,10 @@ function checked_indexing_kernel(dest, src)
     return
 end
 group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
+
+function rand_kernel(dest::AbstractArray{T}) where {T}
+    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
+    dest[i] = rand(T)
+    return
+end
+group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
diff --git a/src/CUDA.jl b/src/CUDA.jl
index fab870d839..0288600332 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -57,6 +57,7 @@ include("device/intrinsics.jl")
 include("device/llvm.jl")
 include("device/runtime.jl")
 include("device/texture.jl")
+include("device/random.jl")
 
 # array essentials
 include("pool.jl")
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 4074aa77ac..a9b6fae4b6 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -200,15 +200,13 @@ end
 
 ## host-side kernels
 
-mutable struct HostKernel{F,TT} <: AbstractKernel{F,TT}
+struct HostKernel{F,TT} <: AbstractKernel{F,TT}
     ctx::CuContext
     mod::CuModule
     fun::CuFunction
 
-    random_state::Union{Nothing,Missing,CuVector{UInt32}}
-
-    function HostKernel{F,TT}(ctx::CuContext, mod::CuModule, fun::CuFunction, random_state) where {F,TT}
-        kernel = new{F,TT}(ctx, mod, fun, random_state)
+    function HostKernel{F,TT}(ctx::CuContext, mod::CuModule, fun::CuFunction) where {F,TT}
+        kernel = new{F,TT}(ctx, mod, fun)
     end
 end
 
@@ -358,19 +356,17 @@ function cufunction_link(@nospecialize(job::CompilerJob), compiled)
         filter!(!isequal("exception_flag"), compiled.external_gvars)
     end
 
-    random_state = nothing
-    if "global_random_state" in compiled.external_gvars
+    # initialize random seeds, if used
+    if "global_random_seed" in compiled.external_gvars
         random_state = missing
-        filter!(!isequal("global_random_state"), compiled.external_gvars)
+        initialize_random_seeds!(mod)
+        filter!(!isequal("global_random_seed"), compiled.external_gvars)
     end
 
-    return HostKernel{job.source.f,job.source.tt}(ctx, mod, fun, random_state)
+    return HostKernel{job.source.f,job.source.tt}(ctx, mod, fun)
 end
 
 function (kernel::HostKernel)(args...; threads::CuDim=1, blocks::CuDim=1, kwargs...)
-    if kernel.random_state !== nothing
-        init_random_state!(kernel, prod(threads) * prod(blocks))
-    end
     call(kernel, map(cudaconvert, args)...; threads, blocks, kwargs...)
 end
 
diff --git a/src/device/intrinsics.jl b/src/device/intrinsics.jl
index 211f9176cf..0e0814b571 100644
--- a/src/device/intrinsics.jl
+++ b/src/device/intrinsics.jl
@@ -41,7 +41,6 @@ include("intrinsics/memory_dynamic.jl")
 include("intrinsics/atomics.jl")
 include("intrinsics/misc.jl")
 include("intrinsics/wmma.jl")
-include("intrinsics/random.jl")
 
 # functionality from libdevice
 #
diff --git a/src/device/intrinsics/random.jl b/src/device/intrinsics/random.jl
deleted file mode 100644
index 2bae5ada3d..0000000000
--- a/src/device/intrinsics/random.jl
+++ /dev/null
@@ -1,71 +0,0 @@
-## random number generation
-
-using Random
-import RandomNumbers
-
-
-# helpers
-
-global_index() = (threadIdx().x, threadIdx().y, threadIdx().z,
-                  blockIdx().x, blockIdx().y, blockIdx().z)
-
-
-# global state
-
-struct ThreadLocalXorshift32 <: RandomNumbers.AbstractRNG{UInt32}
-    vals::CuDeviceArray{UInt32, 6, AS.Generic}
-end
-
-function init_random_state!(kernel, len)
-    if kernel.random_state === missing || length(kernel.random_state) < len
-        kernel.random_state = CuVector{UInt32}(undef, len)
-    end
-
-    random_state_ptr = CuGlobal{Ptr{Cvoid}}(kernel.mod, "global_random_state")
-    random_state_ptr[] = reinterpret(Ptr{Cvoid}, pointer(kernel.random_state))
-end
-
-@eval @inline function global_random_state()
-    ptr = reinterpret(LLVMPtr{UInt32, AS.Generic}, Base.llvmcall(
-        $("""@global_random_state = weak externally_initialized global i$(WORD_SIZE) 0
-             define i$(WORD_SIZE) @entry() #0 {
-                 %ptr = load i$(WORD_SIZE), i$(WORD_SIZE)* @global_random_state, align 8
-                 ret i$(WORD_SIZE) %ptr
-             }
-             attributes #0 = { alwaysinline }
-          """, "entry"), Ptr{Cvoid}, Tuple{}))
-    dims = (blockDim().x, blockDim().y, blockDim().z, gridDim().x, gridDim().y, gridDim().z)
-    CuDeviceArray(dims, ptr)
-end
-
-@device_override Random.default_rng() = ThreadLocalXorshift32(global_random_state())
-
-@device_override Random.make_seed() = clock(UInt32)
-
-function Random.seed!(rng::ThreadLocalXorshift32, seed::Integer)
-    index = global_index()
-    rng.vals[index...] = seed % UInt32
-    return
-end
-
-
-# generators
-
-function xorshift(x::UInt32)::UInt32
-    x = xor(x, x << 13)
-    x = xor(x, x >> 17)
-    x = xor(x, x << 5)
-    return x
-end
-
-function Random.rand(rng::ThreadLocalXorshift32, ::Type{UInt32})
-    # NOTE: we add the current linear index to the local state, to make sure threads get
-    #       different random numbers when unseeded (initial state = 0 for all threads)
-    index = global_index()
-    offset = LinearIndices(rng.vals)[index...]
-    state = rng.vals[index...] + UInt32(offset)
-
-    new_state = xorshift(state)
-    rng.vals[index...] = new_state
-    return new_state
-end
diff --git a/src/device/random.jl b/src/device/random.jl
new file mode 100644
index 0000000000..5a33fed979
--- /dev/null
+++ b/src/device/random.jl
@@ -0,0 +1,224 @@
+## random number generation
+
+using Random
+import RandomNumbers
+
+
+# global state
+
+# we provide 128 bytes of random state, shared across the thread block using shared memory.
+# the same amount of seeding data is available in constant memory, set during initial load.
+
+@eval @inline function global_random_seed()
+    ptr = Base.llvmcall(
+        $("""@global_random_seed = weak addrspace($(AS.Constant)) externally_initialized global [32 x i32] zeroinitializer, align 32
+             define i8 addrspace($(AS.Constant))* @entry() #0 {
+                 %ptr = getelementptr inbounds [32 x i32], [32 x i32] addrspace($(AS.Constant))* @global_random_seed, i64 0, i64 0
+                 %untyped_ptr = bitcast i32 addrspace($(AS.Constant))* %ptr to i8 addrspace($(AS.Constant))*
+                 ret i8 addrspace($(AS.Constant))* %untyped_ptr
+             }
+             attributes #0 = { alwaysinline }
+          """, "entry"), LLVMPtr{UInt32, AS.Constant}, Tuple{})
+    CuDeviceArray((32,), ptr)
+end
+
+@eval @inline function global_random_state()
+    ptr = Base.llvmcall(
+        $("""@global_random_state = weak addrspace($(AS.Shared)) global [32 x i32] zeroinitializer, align 32
+             define i8 addrspace($(AS.Shared))* @entry() #0 {
+                 %ptr = getelementptr inbounds [32 x i32], [32 x i32] addrspace($(AS.Shared))* @global_random_state, i64 0, i64 0
+                 %untyped_ptr = bitcast i32 addrspace($(AS.Shared))* %ptr to i8 addrspace($(AS.Shared))*
+                 ret i8 addrspace($(AS.Shared))* %untyped_ptr
+             }
+             attributes #0 = { alwaysinline }
+          """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
+    CuDeviceArray((32,), ptr)
+end
+
+function initialize_random_seeds!(mod)
+    seed = CuGlobal{NTuple{32,UInt32}}(mod, "global_random_seed")
+    seed[async=true] = Tuple(rand(UInt32, 32))
+end
+
+@device_override Random.make_seed() = clock(UInt32)
+
+@generated function make_full_seed(seed::Integer)
+    quote
+        s_0 = seed % UInt32
+        Base.@nexprs 32 i->s_i = xorshift(s_{i-1})
+        Base.@ncall 32 tuple s
+    end
+end
+
+
+# helpers
+
+function xorshift(x::UInt32)::UInt32
+    x = xor(x, x << 13)
+    x = xor(x, x >> 17)
+    x = xor(x, x << 5)
+    return x
+end
+
+
+# generators
+
+"""
+    SharedTauswortheGenerator()
+
+A maximally equidistributed combined Tausworthe generator.
+
+The generator uses 32 bytes of random state stored in shared memory. This memory is
+zero-initialized; When the first random number is generated, each block will derive an
+initial state from the seed provided during module compilation, and the block identifier.
+Each block will then indepentendly, but deterministically use that state to generate random
+numbers. Within a block, the 32-bytes of random state are identical, so the warp identifier
+will be mixed in to ensure uniqueness across warps. Finally, the first warp of each block
+updates the shared state.
+
+!!! warning
+
+    Although the numbers obtained from this generator "look OK", they do not pass the
+    SmallCrush test suite, so the generator should be deemed experimental.
+"""
+struct SharedTauswortheGenerator <: RandomNumbers.AbstractRNG{UInt32}
+end
+
+@inline function Base.getproperty(rng::SharedTauswortheGenerator, field::Symbol)
+    if field === :seed
+        global_random_seed()
+    elseif field === :state
+        global_random_state()
+    end
+end
+
+@device_override Random.default_rng() = SharedTauswortheGenerator()
+
+function Random.seed!(rng::SharedTauswortheGenerator, seed::Integer)
+    Random.seed!(rng, make_full_seed(seed))
+    return
+end
+
+"""
+    Random.seed!(rng::SharedTauswortheGenerator, seeds)
+
+Seed the on-device Tausworthe generator with a 32-element tuple or array of UInt32 seeds.
+This function only needs to be called by a single warp (i.e. 32 threads) from each block,
+but it does not hurt to call it from all threads (as long as threads are synchronized after
+that). It should always be called by at least 32 threads to ensure the random state is fully
+initialized, even if you will be using the generator from fewer threads!
+"""
+@inline Base.@propagate_inbounds function Random.seed!(rng::SharedTauswortheGenerator, seed)
+    state = initial_state(seed)
+    @inbounds rng.state[laneid()] = state
+    return
+end
+
+@inline Base.@propagate_inbounds function initial_state(seeds)
+    z = seeds[laneid()]
+
+    # mix-in the block id to ensure unique values across blocks
+    # XXX: is this OK? shouldn't we use a generator that allows skipping ahead?
+    #      https://stackoverflow.com/questions/11692785/efficient-xorshift-skip-ahead
+    blockId = blockIdx().x + (blockIdx().y - 1) * gridDim().x +
+                             (blockIdx().z - 1) * gridDim().x * gridDim().y
+    z = xorshift(z ⊻ blockId%UInt32)
+
+    return z
+end
+
+# NOTE: @propagate_inbounds for when we know the passed seed contains at least 32 elements
+# XXX: without @inline @propagate_inbounds, we get local memory accesses (e.g. with `rand!`)
+
+#const TausShift1 = CuConstantMemory{UInt32}((6, 2, 13, 3))
+#const TausShift2 = CuConstantMemory{UInt32}((13, 27, 21, 12))
+#const TausShift3 = CuConstantMemory{UInt32}((18, 2, 7, 13))
+#const TausOffset = CuConstantMemory{UInt32}((4294967294, 4294967288, 4294967280, 4294967168))
+# XXX: constant memory isn't supported yet, so let's resort to llvmcall
+for (name, vals) in ("TausShift1" => (6, 2, 13, 3),
+                     "TausShift2" => (13, 27, 21, 12),
+                     "TausShift3" => (18, 2, 7, 13),
+                     "TausOffset" => (4294967294, 4294967288, 4294967280, 4294967168))
+    @eval @inline function $(Symbol(name))()
+        ptr = Base.llvmcall(
+            $("""@$(name) = weak addrspace($(AS.Constant)) global [4 x i32] [i32 $(vals[1]), i32 $(vals[2]), i32 $(vals[3]), i32 $(vals[4])], align 4
+                define i8 addrspace($(AS.Constant))* @entry() #0 {
+                    %ptr = getelementptr inbounds [4 x i32], [4 x i32] addrspace($(AS.Constant))* @$(name), i64 0, i64 0
+                    %untyped_ptr = bitcast i32 addrspace($(AS.Constant))* %ptr to i8 addrspace($(AS.Constant))*
+                    ret i8 addrspace($(AS.Constant))* %untyped_ptr
+                }
+                attributes #0 = { alwaysinline }
+            """, "entry"), LLVMPtr{UInt32, AS.Constant}, Tuple{})
+        CuDeviceArray((4,), ptr)
+    end
+end
+
+function TausStep(z::Unsigned, S1::Integer, S2::Integer, S3::Integer, M::Unsigned)
+    b = (((z << S1) ⊻ z) >> S2)
+    return (((z & M) << S3) ⊻ b)
+end
+
+"""
+    Random.rand(rng::SharedTauswortheGenerator, UInt32)
+
+Generate a byte of random data using the on-device Tausworthe generator.
+
+!!! warning
+
+    This functions performs synchronization, so should only be called uniformly (i.e. by all
+    threads in a block). Do not call it from a branch that diverges within a block, or your
+    kernel may deadlock.
+"""
+function Random.rand(rng::SharedTauswortheGenerator, ::Type{UInt32})
+    @inline pow2_mod1(x, y) = (x-1)&(y-1) + 1
+
+    threadId = UInt32(threadIdx().x + (threadIdx().y - 1) * blockDim().x +
+                                      (threadIdx().z - 1) * blockDim().x * blockDim().y)
+    warpId = (threadId-UInt32(1)) >> 5 + UInt32(1)  # fld1
+    i = pow2_mod1(threadId, 32)
+    j = pow2_mod1(threadId, 4)
+
+    @inbounds begin
+        # get state
+        z = rng.state[i]
+        if z == 0
+            z = initial_state(rng.seed)
+        end
+
+        # mix-in the warp id to ensure unique values across blocks.
+        # we have max 1024 threads per block, so can safely shift by 16 bits.
+        # XXX: see comment in `initial_state`
+        z = xorshift(z ⊻ (warpId << 16))
+
+        sync_threads()
+
+        # advance & update state
+        S1, S2, S3, M = TausShift1()[j], TausShift2()[j], TausShift3()[j], TausOffset()[j]
+        state = TausStep(z, S1, S2, S3, M)
+        if warpId == 1
+            rng.state[i] = state
+        end
+
+        sync_threads()
+
+        # generate based on 4 bytes of state
+        mask = active_mask()
+        i1 = pow2_mod1(threadId+1, 32)
+        i2 = pow2_mod1(threadId+2, 32)
+        i3 = pow2_mod1(threadId+3, 32)
+        if active_mask() == typemax(UInt32)
+            # we have a full warp, so we can safely shuffle.
+            # get the warp-local states from neighbouring threads.
+            s1 = shfl_sync(FULL_MASK, state, i1)
+            s2 = shfl_sync(FULL_MASK, state, i2)
+            s3 = shfl_sync(FULL_MASK, state, i3)
+        else
+            # can't shuffle, so fall back to fetching block-global state.
+            # this results is less entropy, as it reuses data from the first warp.
+            s1 = rng.state[i1]
+            s2 = rng.state[i2]
+            s3 = rng.state[i3]
+        end
+        state ⊻ s1 ⊻ s2 ⊻ s3
+    end
+end
diff --git a/src/random.jl b/src/random.jl
index 4afbd98489..07416c5d09 100644
--- a/src/random.jl
+++ b/src/random.jl
@@ -4,6 +4,114 @@ using Random
 
 export rand_logn!, rand_poisson!
 
+
+# native RNG
+
+"""
+    CUDA.RNG()
+
+A random number generator using `rand()` in a device kernel.
+
+!!! warning
+
+    This generator is experimental, and not used by default yet. Currently, it does not
+    pass SmallCrush (but neither does the current GPUArrays.jl-based generator). If you
+    can, use a CURAND-based generator for now.
+
+See also: [SharedTauswortheGenerator](@ref)
+"""
+struct RNG <: AbstractRNG
+    state::CuVector{UInt32}
+
+    function RNG(seed)
+        @assert length(seed) == 32
+        new(seed)
+    end
+end
+
+RNG() = RNG(rand(UInt32, 32))
+
+function Random.seed!(rng::RNG, seed::AbstractVector{UInt32})
+    @assert length(seed) == 32
+    copyto!(rng.state, seed)
+end
+
+function Random.seed!(rng::RNG)
+    Random.rand!(rng.state)
+    return
+end
+
+function Random.rand!(rng::RNG, A::AnyCuArray)
+    function kernel(A::AbstractArray{T}, state::AbstractVector{UInt32}) where {T}
+        device_rng = Random.default_rng()
+
+        # initialize the state
+        tid = threadIdx().x
+        if tid <= 32
+            # we know for sure the seed will contain 32 values
+            @inbounds Random.seed!(device_rng, state)
+        end
+
+        sync_threads()
+
+        # grid-stride loop
+        offset = (blockIdx().x - 1) * blockDim().x
+        while offset < length(A)
+            # generating random numbers synchronizes threads, so needs to happen uniformly.
+            val = Random.rand(device_rng, T)
+
+            i = tid + offset
+            if i <= length(A)
+                @inbounds A[i] = val
+            end
+
+            offset += (blockDim().x - 1) * gridDim().x
+        end
+
+        sync_threads()
+
+        # save the device rng state of the first block (other blocks are derived from it)
+        # so that subsequent launches generate different random numbers
+        if blockIdx().x == 1 && tid <= 32
+            @inbounds state[tid] = device_rng.state[tid]
+        end
+
+        return
+    end
+
+    kernel = @cuda launch=false name="rand!" kernel(A, rng.state)
+    config = launch_configuration(kernel.fun; max_threads=64)
+    threads = max(32, min(config.threads, length(A)))
+    blocks = min(config.blocks, cld(length(A), threads))
+    kernel(A, rng.state; threads=threads, blocks=blocks)
+
+    # XXX: updating the state from within the kernel is racey,
+    #      so we should have a RNG per stream
+
+    A
+end
+
+# TODO: `randn!`; cannot reuse from Base or RandomNumbers, as those do scalar indexing
+
+
+# generic functionality
+
+function Random.rand!(rng::Union{RNG,CURAND.RNG,GPUArrays.RNG}, A::AbstractArray{T}) where {T}
+    B = CuArray{T}(undef, size(A))
+    Random.rand!(rng, B)
+    copyto!(A, B)
+end
+
+function Random.rand(rng::Union{RNG,CURAND.RNG,GPUArrays.RNG}, T::Type)
+    assertscalar("scalar rand")
+    A = CuArray{T}(undef, 1)
+    Random.rand!(rng, A)
+    A[]
+end
+
+
+# RNG-less interface
+
 # the interface is split in two levels:
 # - functions that extend the Random standard library, and take an RNG as first argument,
 #   will only ever dispatch to CURAND and as a result are limited in the types they support.
diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl
index 1337898b6c..a4b336a9fd 100644
--- a/test/device/intrinsics.jl
+++ b/test/device/intrinsics.jl
@@ -1184,52 +1184,3 @@ end
 end
 
 end
-
-
-
-############################################################################################
-
-@testset "random numbers" begin
-
-n = 256
-
-@testset "basic" begin
-    function kernel(A::CuDeviceArray{T}, B::CuDeviceArray{T}) where {T}
-        tid = threadIdx().x
-        A[tid] = rand(T)
-        B[tid] = rand(T)
-        return nothing
-    end
-
-    @testset for T in (Int32, UInt32, Int64, UInt64, Int128, UInt128,
-                       Float32, Float64)
-        a = CUDA.zeros(T, n)
-        b = CUDA.zeros(T, n)
-
-        @cuda threads=n kernel(a, b)
-
-        @test all(Array(a) .!= Array(b))
-    end
-end
-
-@testset "custom seed" begin
-    function kernel(A::CuDeviceArray{T}) where {T}
-        tid = threadIdx().x
-        Random.seed!(1234)
-        A[tid] = rand(T)
-        return nothing
-    end
-
-    @testset for T in (Int32, UInt32, Int64, UInt64, Int128, UInt128,
-                       Float32, Float64)
-        a = CUDA.zeros(T, n)
-        b = CUDA.zeros(T, n)
-
-        @cuda threads=n kernel(a)
-        @cuda threads=n kernel(b)
-
-        @test Array(a) == Array(b)
-    end
-end
-
-end
diff --git a/test/device/random.jl b/test/device/random.jl
new file mode 100644
index 0000000000..b026c5cba7
--- /dev/null
+++ b/test/device/random.jl
@@ -0,0 +1,81 @@
+using Random
+
+n = 256
+
+@testset "basic rand($T), seed $seed" for T in (Int32, UInt32, Int64, UInt64, Int128, UInt128,
+                                                Float32, Float64),
+                                          seed in (nothing, #=missing,=# 1234)
+
+    function apply_seed(seed)
+        if seed === missing
+            # should result in different numbers across launches
+            Random.seed!()
+            # XXX: this currently doesn't work, because of the definition in Base,
+            #      `seed!(r::MersenneTwister=default_rng())`, which breaks overriding
+            #      `default_rng` with a non-MersenneTwister RNG.
+        elseif seed !== nothing
+            # should result in the same numbers
+            Random.seed!(seed)
+        elseif seed === nothing
+            # should result in different numbers across launches,
+            # as determined by the seed set during module loading.
+        end
+    end
+
+    # different kernel invocations should get different numbers
+    @testset "across launches" begin
+        function kernel(A::AbstractArray{T}, seed) where {T}
+            apply_seed(seed)
+            tid = threadIdx().x
+            A[tid] = rand(T)
+            return nothing
+        end
+
+        a = CUDA.zeros(T, n)
+        b = CUDA.zeros(T, n)
+
+        @cuda threads=n kernel(a, seed)
+        @cuda threads=n kernel(b, seed)
+
+        if seed === nothing || seed === missing
+            @test all(Array(a) .!= Array(b))
+        else
+            @test Array(a) == Array(b)
+        end
+    end
+
+    # multiple calls to rand should get different numbers
+    @testset "across calls" begin
+        function kernel(A::AbstractArray{T}, B::AbstractArray{T}, seed) where {T}
+            apply_seed(seed)
+            tid = threadIdx().x
+            A[tid] = rand(T)
+            B[tid] = rand(T)
+            return nothing
+        end
+
+        a = CUDA.zeros(T, n)
+        b = CUDA.zeros(T, n)
+
+        @cuda threads=n kernel(a, b, seed)
+
+        @test all(Array(a) .!= Array(b))
+    end
+
+    # different threads should get different numbers
+    @testset "across threads" for active_dim in 1:6
+        function kernel(A::AbstractArray{T}, seed) where {T}
+            apply_seed(seed)
+            id = threadIdx().x*threadIdx().y*threadIdx().z*blockIdx().x*blockIdx().y*blockIdx().z
+            A[id] = rand(T)
+            return nothing
+        end
+
+        tx, ty, tz, bx, by, bz = [dim == active_dim ? 2 : 1 for dim in 1:6]
+        a = CUDA.zeros(T, 2)
+
+        @cuda threads=(tx, ty, tz) blocks=(bx, by, bz) kernel(a, seed)
+
+        @test Array(a)[1] != Array(a)[2]
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 89f016908c..94b4ce3312 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -171,6 +171,9 @@ if Sys.ARCH == :aarch64
     # CUFFT segfaults on ARM
     push!(skip_tests, "cufft")
 end
+if VERSION < v"1.6.1-"
+    push!(skip_tests, "device/random")
+end
 for (i, test) in enumerate(skip_tests)
     # we find tests by scanning the file system, so make sure the path separator matches
     skip_tests[i] = replace(test, '/'=>Base.Filesystem.path_separator)