From ccf6f21ebf5f4820e9c1ed34eaf9bf80097bb57a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 12 Feb 2023 20:12:25 -0500 Subject: [PATCH] Transition GPUArrays to KernelAbstractions --- Manifest.toml | 129 +++++++++++++++++++++++++++++++------ Project.toml | 1 + src/GPUArrays.jl | 1 + src/device/execution.jl | 114 ++------------------------------ src/host/abstractarray.jl | 28 +++----- src/host/base.jl | 20 +++--- src/host/construction.jl | 21 +++--- src/host/random.jl | 21 +++--- src/host/uniformscaling.jl | 48 ++++++++------ test/runtests.jl | 20 +++--- 10 files changed, 200 insertions(+), 203 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 1f40f4dd..0c9b8e23 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,35 +2,51 @@ [[Adapt]] deps = ["LinearAlgebra"] -git-tree-sha1 = "af92965fb30777147966f58acb05da51c5616b5f" +git-tree-sha1 = "0310e08cb19f5da31d08341c6120c047598f5b9c" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.3.3" +version = "3.5.0" [[ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" [[CEnum]] -git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.1" +version = "0.4.2" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.2+0" [[Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" [[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" + +[[FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" [[GPUArraysCore]] deps = ["Adapt"] -path = "lib/GPUArraysCore" +git-tree-sha1 = "57f7cde02d7a53c9d1d28443b9f11ac5fbe7ebc9" uuid = "46192b85-c4d5-4398-a991-12ede77f4527" version = "0.1.3" @@ -40,29 +56,49 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[JLLWrappers]] deps = ["Preferences"] -git-tree-sha1 = "22df5b96feef82434b07327e2d3c770a9b21e023" +git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.4.0" +version = "1.4.1" + +[[KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "817c259d256d0b8e4a8c7aaf281a303fd5db32f8" +repo-rev = "vc/nix_dependencies" +repo-url = "https://github.com/JuliaGPU/KernelAbstractions.jl.git" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.0" + + [KernelAbstractions.extensions] + CUDAKernels = "CUDA" + + [KernelAbstractions.weakdeps] + CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [[LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "f8dcd7adfda0dddaf944e62476d823164cccc217" +git-tree-sha1 = "df115c31f5c163697eede495918d8e85045c8f04" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.7.1" +version = "4.16.0" [[LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "67cc5406b15bd04ff72a45f628bec61d36078908" +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"] +git-tree-sha1 = "771bfe376249626d3ca12bcd58ba243d3f961576" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.13+3" +version = "0.0.16+0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" [[LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" [[LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] @@ -71,17 +107,24 @@ uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" [[LinearAlgebra]] -deps = ["Libdl"] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.10" + [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" @@ -89,22 +132,31 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.0+0" [[MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2022.10.11" [[NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+0" [[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.0" [[Preferences]] deps = ["TOML"] -git-tree-sha1 = "2cf929d64681236a2e074ffafb8d568733d2e6af" +git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.3" +version = "1.3.0" [[Printf]] deps = ["Unicode"] @@ -115,7 +167,7 @@ deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] -deps = ["Serialization"] +deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[Reexport]] @@ -125,6 +177,7 @@ version = "1.2.2" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -133,20 +186,39 @@ uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" uuid = "6462fe0b-24de-5631-8697-dd941f90decc" [[SparseArrays]] -deps = ["LinearAlgebra", "Random"] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] +git-tree-sha1 = "cee507162ecbb677450f20058ca83bd559b6b752" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.5.14" + +[[StaticArraysCore]] +git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a" +uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +version = "1.4.0" + [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" + +[[SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" [[TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" [[Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" [[UUIDs]] deps = ["Random", "SHA"] @@ -155,14 +227,33 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" +[[UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "33af9d2031d0dc09e2be9a0d4beefec4466def8e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.0" + [[Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" + +[[libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.4.0+0" [[nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" [[p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/Project.toml b/Project.toml index 6d3a028b..8ddc72ff 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "8.6.2" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index 2d4f1bd9..3f80f90a 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -22,6 +22,7 @@ include("device/indexing.jl") include("device/memory.jl") include("device/synchronization.jl") +using KernelAbstractions # host abstractions include("host/abstractarray.jl") include("host/construction.jl") diff --git a/src/device/execution.jl b/src/device/execution.jl index 41285bc3..47df5f5d 100644 --- a/src/device/execution.jl +++ b/src/device/execution.jl @@ -1,110 +1,8 @@ # kernel execution -export AbstractGPUBackend, AbstractKernelContext, gpu_call - -abstract type AbstractGPUBackend end - -abstract type AbstractKernelContext end - -import GPUArraysCore: backend - -""" - gpu_call(kernel::Function, arg0, args...; kwargs...) - -Executes `kernel` on the device that backs `arg` (see [`backend`](@ref)), passing along any -arguments `args`. Additionally, the kernel will be passed the kernel execution context (see -[`AbstractKernelContext`]), so its signature should be `(ctx::AbstractKernelContext, arg0, -args...)`. - -The keyword arguments `kwargs` are not passed to the function, but are interpreted on the -host to influence how the kernel is executed. The following keyword arguments are supported: - -- `target::AbstractArray`: specify which array object to use for determining execution - properties (defaults to the first argument `arg0`). -- `elements::Int`: how many elements will be processed by this kernel. In most - circumstances, this will correspond to the total number of threads that needs to be - launched, unless the kernel supports a variable number of elements to process per - iteration. Defaults to the length of `arg0` if no other keyword arguments that influence - the launch configuration are specified. -- `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are - launched. This cannot be used in combination with the `elements` argument. -- `name::String`: inform the back end about the name of the kernel to be executed. This can - be used to emit better diagnostics, and is useful with anonymous kernels. -""" -function gpu_call(kernel::F, args::Vararg{Any,N}; - target::AbstractArray=first(args), - elements::Union{Int,Nothing}=nothing, - threads::Union{Int,Nothing}=nothing, - blocks::Union{Int,Nothing}=nothing, - name::Union{String,Nothing}=nothing) where {F,N} - # non-trivial default values for launch configuration - if elements===nothing && threads===nothing && blocks===nothing - elements = length(target) - elseif elements===nothing - if threads === nothing - threads = 1 - end - if blocks === nothing - blocks = 1 - end - elseif threads!==nothing || blocks!==nothing - error("Cannot specify both elements and threads/blocks configuration") - end - - # the number of elements to process needs to be passed to the kernel somehow, so there's - # no easy way to do this without passing additional arguments or changing the context. - # both are expensive, so require manual use of `launch_heuristic` for those kernels. - elements_per_thread = 1 - - if elements !== nothing - @assert elements > 0 - heuristic = launch_heuristic(backend(target), kernel, args...; - elements, elements_per_thread) - config = launch_configuration(backend(target), heuristic; - elements, elements_per_thread) - gpu_call(backend(target), kernel, args, config.threads, config.blocks; name=name) - else - @assert threads > 0 - @assert blocks > 0 - gpu_call(backend(target), kernel, args, threads, blocks; name=name) - end -end - -# how many threads and blocks `kernel` needs to be launched with, passing arguments `args`, -# to fully saturate the GPU. `elements` indicates the number of elements that needs to be -# processed, while `elements_per_threads` indicates the number of elements this kernel can -# process (i.e. if it's a grid-stride kernel, or 1 if otherwise). -# -# this heuristic should be specialized for the back-end, ideally using an API for maximizing -# the occupancy of the launch configuration (like CUDA's occupancy API). -function launch_heuristic(backend::AbstractGPUBackend, kernel, args...; - elements::Int, elements_per_thread::Int) - return (threads=256, blocks=32) -end - -# determine how many threads and blocks to actually launch given upper limits. -# returns a tuple of blocks, threads, and elements_per_thread (which is always 1 -# unless specified that the kernel can handle a number of elements per thread) -function launch_configuration(backend::AbstractGPUBackend, heuristic; - elements::Int, elements_per_thread::Int) - threads = clamp(elements, 1, heuristic.threads) - blocks = max(cld(elements, threads), 1) - - if elements_per_thread > 1 && blocks > heuristic.blocks - # we want to launch more blocks than required, so prefer a grid-stride loop instead - ## try to stick to the number of blocks that the heuristic suggested - blocks = heuristic.blocks - nelem = cld(elements, blocks*threads) - ## only bump the number of blocks if we really need to - if nelem > elements_per_thread - nelem = elements_per_thread - blocks = cld(elements, nelem*threads) - end - (; threads, blocks, elements_per_thread=nelem) - else - (; threads, blocks, elements_per_thread=1) - end -end - -gpu_call(backend::AbstractGPUBackend, kernel, args, threads::Int, blocks::Int; kwargs...) = - error("Not implemented") # COV_EXCL_LINE +# TODO: +# - Rename KA device to backend +# - Who owns `AbstractGPUBackend`? +# a; KernelAbstractions +# b; GPUArraysCore +backend(a) = KernelAbstractions.get_device(a) \ No newline at end of file diff --git a/src/host/abstractarray.jl b/src/host/abstractarray.jl index 98c0973d..05e0e8d6 100644 --- a/src/host/abstractarray.jl +++ b/src/host/abstractarray.jl @@ -81,13 +81,12 @@ for (D, S) in ((AnyGPUArray, Array), end # kernel-based variant for copying between wrapped GPU arrays - -function linear_copy_kernel!(ctx::AbstractKernelContext, dest, dstart, src, sstart, n) - i = linear_index(ctx)-1 +# TODO: Add `@Const` to `src` +@kernel function linear_copy_kernel!(dest, dstart, src, sstart, n) + i = @index(Global, Linear) - 1 if i < n @inbounds dest[dstart+i] = src[sstart+i] end - return end function Base.copyto!(dest::AnyGPUArray, dstart::Integer, @@ -97,10 +96,8 @@ function Base.copyto!(dest::AnyGPUArray, dstart::Integer, destinds, srcinds = LinearIndices(dest), LinearIndices(src) (checkbounds(Bool, destinds, dstart) && checkbounds(Bool, destinds, dstart+n-1)) || throw(BoundsError(dest, dstart:dstart+n-1)) (checkbounds(Bool, srcinds, sstart) && checkbounds(Bool, srcinds, sstart+n-1)) || throw(BoundsError(src, sstart:sstart+n-1)) - - gpu_call(linear_copy_kernel!, - dest, dstart, src, sstart, n; - elements=n) + kernel = linear_copy_kernel!(backend(dest)) + kernel(dest, dstart, src, sstart, n; ndrange=elements) return dest end @@ -150,13 +147,9 @@ end ## generalized blocks of heterogeneous memory -function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, length) - i = linear_index(ctx) - if i <= length - idx = CartesianIndices(shape)[i] - @inbounds dest[idx + dest_offsets] = src[idx + src_offsets] - end - return +@kernel function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets) + I = @index(Global, Cartesian) + @inbounds dest[I + dest_offsets] = src[I + src_offsets] end function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{N}, @@ -170,9 +163,8 @@ function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{ dest_offsets = first(destcrange) - oneunit(CartesianIndex{N}) src_offsets = first(srccrange) - oneunit(CartesianIndex{N}) - gpu_call(cartesian_copy_kernel!, - dest, dest_offsets, src, src_offsets, shape, len; - elements=len) + kernel = cartesian_copy_kernel!(backend(dest)) + kernel(dest, dest_offsets, src, src_offsets; ndrange=shape) dest end diff --git a/src/host/base.jl b/src/host/base.jl index b840c162..96fe3fd4 100644 --- a/src/host/base.jl +++ b/src/host/base.jl @@ -26,14 +26,13 @@ end # benchmark faster by having fewer read operations and avoiding the costly division # operation. Additionally, when repeating over the trailing dimension. `inner=(ones..., n)`, # data access can be contiguous during both the read and write operations. -function repeat_inner_src_kernel!( - ctx::AbstractKernelContext, +@kernel function repeat_inner_src_kernel!( xs::AbstractArray{<:Any, N}, inner::NTuple{N, Int}, out::AbstractArray{<:Any, N} ) where {N} # Get single element from src - idx = @cartesianidx xs + idx = @index(Global, Cartesian) @inbounds val = xs[idx] # Loop over "repeat" indices of inner @@ -44,7 +43,6 @@ function repeat_inner_src_kernel!( end @inbounds out[CartesianIndex(odx)] = val end - return nothing end function repeat_inner(xs::AnyGPUArray, inner) @@ -64,23 +62,24 @@ function repeat_inner(xs::AnyGPUArray, inner) # relevant benchmarks. if argmax(inner) == firstindex(inner) # Parallelize over the destination array - gpu_call(repeat_inner_dst_kernel!, xs, inner, out; elements=prod(size(out))) + kernel = repeat_inner_dst_kernel!(backend(out)) + kernel(xs, inner, out; ndrange=size(out)) else # Parallelize over the source array - gpu_call(repeat_inner_src_kernel!, xs, inner, out; elements=prod(size(xs))) + kernel = repeat_inner_src_kernel!(backend(xs)) + kernel(xs, inner, out; ndrange=size(xs)) end return out end -function repeat_outer_kernel!( - ctx::AbstractKernelContext, +@kernel function repeat_outer_kernel!( xs::AbstractArray{<:Any, N}, xssize::NTuple{N}, outer::NTuple{N}, out::AbstractArray{<:Any, N} ) where {N} # Get index to input element - idx = @cartesianidx xs + idx = @index(Global, Cartesian) @inbounds val = xs[idx] # Loop over repeat indices, copying val to out @@ -98,7 +97,8 @@ end function repeat_outer(xs::AnyGPUArray, outer) out = similar(xs, eltype(xs), outer .* size(xs)) any(==(0), size(out)) && return out # consistent with `Base.repeat` - gpu_call(repeat_outer_kernel!, xs, size(xs), outer, out; elements=length(xs)) + kernel = repeat_outer_kernel!(backend(xs)) + kernel(xs, size(xs), outer, out; ndrange=size(xs)) return out end diff --git a/src/host/construction.jl b/src/host/construction.jl index 8cdae50b..482d50e9 100644 --- a/src/host/construction.jl +++ b/src/host/construction.jl @@ -11,29 +11,30 @@ Base.convert(::Type{T}, a::AbstractArray) where {T<:AbstractGPUArray} = a isa T function Base.fill!(A::AnyGPUArray{T}, x) where T length(A) == 0 && return A - gpu_call(A, convert(T, x)) do ctx, a, val - idx = @linearidx(a) + @kernel fill!(a, val) + idx = @index(Linear, Global) @inbounds a[idx] = val - return end + kernel = fill!(backend(A)) + kernel(A, x) A end ## identity matrices -function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T - i = linear_index(ctx) +@kernel function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T + i = @index(Global, Linear) ilin = (stride * (i - 1)) + i ilin > length(res) && return @inbounds res[ilin] = val - return end function (T::Type{<: AnyGPUArray{U}})(s::UniformScaling, dims::Dims{2}) where {U} res = similar(T, dims) fill!(res, zero(U)) - gpu_call(identity_kernel, res, size(res, 1), s.λ; elements=minimum(dims)) + kernel = identity_kernel(backend(res)) + kernel(res, size(res, 1), s.λ; ndrange=minimum(dims)) res end @@ -43,7 +44,8 @@ end function Base.copyto!(A::AbstractGPUMatrix{T}, s::UniformScaling) where T fill!(A, zero(T)) - gpu_call(identity_kernel, A, size(A, 1), s.λ; elements=minimum(size(A))) + kernel = identity_kernel(backend(A)) + kernel(A, size(A, 1), s.λ; ndrange=minimum(size(A))) A end @@ -52,7 +54,8 @@ function _one(unit::T, x::AbstractGPUMatrix) where {T} m==n || throw(DimensionMismatch("multiplicative identity defined only for square matrices")) I = similar(x, T) fill!(I, zero(T)) - gpu_call(identity_kernel, I, m, unit; elements=m) + kernel = identity_kernel(backend(I)) + kernel(I, m, unit; ndrange=m) I end diff --git a/src/host/random.jl b/src/host/random.jl index 09e4257d..bd62590f 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -84,29 +84,32 @@ function Random.seed!(rng::RNG, seed::Vector{UInt32}) end function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number - gpu_call(A, rng.state) do ctx, a, randstates - idx = linear_index(ctx) - idx > length(a) && return + @kernel rand!(a, randstate) + idx = @index(Global, Linear) @inbounds a[idx] = gpu_rand(T, ctx, randstates) - return end + kernel = rand!(backend(A)) + kernel(A, rng.state) A end function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number threads = (length(A) - 1) ÷ 2 + 1 length(A) == 0 && return - gpu_call(A, rng.state; elements = threads) do ctx, a, randstates - idx = 2*(linear_index(ctx) - 1) + 1 + @kernel randn!(a, randstates) + i = @index(Global, Linear) + idx = 2*(i - 1) + 1 U1 = gpu_rand(T, ctx, randstates) U2 = gpu_rand(T, ctx, randstates) Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2) Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2) @inbounds a[idx] = Z0 - idx + 1 > length(a) && return - @inbounds a[idx + 1] = Z1 - return + if idx + 1 <= length(a) + @inbounds a[idx + 1] = Z1 + end end + kernel = randn!(backend(A)) + kernel(A, rng.states; ndrange=threads) A end diff --git a/src/host/uniformscaling.jl b/src/host/uniformscaling.jl index 848eef5e..6764a89e 100644 --- a/src/host/uniformscaling.jl +++ b/src/host/uniformscaling.jl @@ -12,20 +12,20 @@ const unittriangularwrappers = ( (:UnitLowerTriangular, :LowerTriangular) ) -function kernel_generic(ctx, B, J, min_size) - lin_idx = linear_index(ctx) - lin_idx > min_size && return nothing - @inbounds diag_idx = diagind(B)[lin_idx] - @inbounds B[diag_idx] += J - return nothing +@kernel function kernel_generic(ctx, B, J, min_size) + lin_idx = @index(Global, Linear) + if lin_idx <= min_size + @inbounds diag_idx = diagind(B)[lin_idx] + @inbounds B[diag_idx] += J + end end -function kernel_unittriangular(ctx, B, J, diagonal_val, min_size) - lin_idx = linear_index(ctx) - lin_idx > min_size && return nothing - @inbounds diag_idx = diagind(B)[lin_idx] - @inbounds B[diag_idx] = diagonal_val + J - return nothing +@kernel function kernel_unittriangular(ctx, B, J, diagonal_val, min_size) + lin_idx = @index(Global, Linear) + if lin_idx <= min_size + @inbounds diag_idx = diagind(B)[lin_idx] + @inbounds B[diag_idx] = diagonal_val + J + end end for (t1, t2) in unittriangularwrappers @@ -34,7 +34,8 @@ for (t1, t2) in unittriangularwrappers B = similar(parent(A), typeof(oneunit(T) + J)) copyto!(B, parent(A)) min_size = minimum(size(B)) - gpu_call(kernel_unittriangular, B, J, one(eltype(B)), min_size; elements=min_size) + kernel = kernel_unittriangular(backend(B)) + kernel(B, J, one(eltype(B)), min_size; ndrange=min_size) return $t2(B) end @@ -42,7 +43,8 @@ for (t1, t2) in unittriangularwrappers B = similar(parent(A), typeof(J - oneunit(T))) B .= .- parent(A) min_size = minimum(size(B)) - gpu_call(kernel_unittriangular, B, J, -one(eltype(B)), min_size; elements=min_size) + kernel = kernel_unittriangular(backend(B)) + kernel(B, J, -one(eltype(B)), min_size; ndrange=min_size) return $t2(B) end end @@ -54,7 +56,8 @@ for t in genericwrappers B = similar(parent(A), typeof(oneunit(T) + J)) copyto!(B, parent(A)) min_size = minimum(size(B)) - gpu_call(kernel_generic, B, J, min_size; elements=min_size) + kernel = kernel_generic(backend()) + kernel(B, J, min_size; ndrange=min_size) return $t(B) end @@ -62,7 +65,8 @@ for t in genericwrappers B = similar(parent(A), typeof(J - oneunit(T))) B .= .- parent(A) min_size = minimum(size(B)) - gpu_call(kernel_generic, B, J, min_size; elements=min_size) + kernel = kernel_generic(backend()) + kernel(B, J, min_size; ndrange=min_size) return $t(B) end end @@ -73,7 +77,8 @@ function (+)(A::Hermitian{T,<:AbstractGPUMatrix}, J::UniformScaling{<:Complex}) B = similar(parent(A), typeof(oneunit(T) + J)) copyto!(B, parent(A)) min_size = minimum(size(B)) - gpu_call(kernel_generic, B, J, min_size; elements=min_size) + kernel = kernel_generic(backend()) + kernel(B, J, min_size; ndrange=min_size) return B end @@ -81,7 +86,8 @@ function (-)(J::UniformScaling{<:Complex}, A::Hermitian{T,<:AbstractGPUMatrix}) B = similar(parent(A), typeof(J - oneunit(T))) B .= .-parent(A) min_size = minimum(size(B)) - gpu_call(kernel_generic, B, J, min_size; elements=min_size) + kernel = kernel_generic(backend()) + kernel(B, J, min_size; ndrange=min_size) return B end @@ -90,7 +96,8 @@ function (+)(A::AbstractGPUMatrix{T}, J::UniformScaling) where T B = similar(A, typeof(oneunit(T) + J)) copyto!(B, A) min_size = minimum(size(B)) - gpu_call(kernel_generic, B, J, min_size; elements=min_size) + kernel = kernel_generic(backend()) + kernel(B, J, min_size; ndrange=min_size) return B end @@ -98,6 +105,7 @@ function (-)(J::UniformScaling, A::AbstractGPUMatrix{T}) where T B = similar(A, typeof(J - oneunit(T))) B .= .-A min_size = minimum(size(B)) - gpu_call(kernel_generic, B, J, min_size; elements=min_size) + kernel = kernel_generic(backend()) + kernel(B, J, min_size; ndrange=min_size) return B end diff --git a/test/runtests.jl b/test/runtests.jl index 9c4c5dde..6aeea549 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,20 +2,20 @@ using GPUArrays, Test, Pkg include("testsuite.jl") -@testset "JLArray" begin - # install the JLArrays subpackage in a temporary environment - old_project = Base.active_project() - Pkg.activate(; temp=true) - Pkg.develop(path=joinpath(dirname(@__DIR__), "lib", "JLArrays")) +# @testset "JLArray" begin +# # install the JLArrays subpackage in a temporary environment +# old_project = Base.active_project() +# Pkg.activate(; temp=true) +# Pkg.develop(path=joinpath(dirname(@__DIR__), "lib", "JLArrays")) - using JLArrays +# using JLArrays - jl([1]) +# jl([1]) - TestSuite.test(JLArray) +# TestSuite.test(JLArray) - Pkg.activate(old_project) -end +# Pkg.activate(old_project) +# end @testset "Array" begin TestSuite.test(Array)