From b828c8d27e22a776e142cc02016a3e44010e7575 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Thu, 4 Apr 2024 13:06:24 -0400 Subject: [PATCH 1/8] new GPUArrays interface for KA transition --- src/gpuarrays.jl | 42 +++--------------------------------------- 1 file changed, 3 insertions(+), 39 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index e3aec24c3f..ec0771c706 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,5 +1,7 @@ # GPUArrays.jl interface +import KernelAbstractions +import KernelAbstractions: Backend # # Device functionality @@ -8,9 +10,7 @@ ## execution -struct CuArrayBackend <: AbstractGPUBackend end - -struct CuKernelContext <: AbstractKernelContext end +struct CuArrayBackend <: Backend end @inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} @@ -24,39 +24,3 @@ struct CuKernelContext <: AbstractKernelContext end launch_configuration(kernel.fun; max_threads=256) end end - -@inline function GPUArrays.gpu_call(::CuArrayBackend, f::F, args::TT, threads::Int, - blocks::Int; name::Union{String,Nothing}) where {F,TT} - @cuda threads blocks name f(CuKernelContext(), args...) -end - - -## on-device - -# indexing - -GPUArrays.blockidx(ctx::CuKernelContext) = blockIdx().x -GPUArrays.blockdim(ctx::CuKernelContext) = blockDim().x -GPUArrays.threadidx(ctx::CuKernelContext) = threadIdx().x -GPUArrays.griddim(ctx::CuKernelContext) = gridDim().x - -# memory - -@inline function GPUArrays.LocalMemory(::CuKernelContext, ::Type{T}, ::Val{dims}, ::Val{id} - ) where {T, dims, id} - ptr = CUDA._shmem(Val(id), T, Val(prod(dims))) - ptr = reinterpret(LLVMPtr{T, AS.Shared}, ptr) - CuDeviceArray{T,length(dims),AS.Shared}(ptr, dims) -end - -# synchronization - -@inline GPUArrays.synchronize_threads(::CuKernelContext) = sync_threads() - - - -# -# Host abstractions -# - -GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend() From 9590be3395273ef3c2d42f61e08d9de72744f2ef Mon Sep 17 00:00:00 2001 From: James Schloss Date: Tue, 23 Apr 2024 13:35:59 +0200 Subject: [PATCH 2/8] buildkite: remove this before merging --- .buildkite/pipeline.yml | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 551b9a1eba..93ea9ea21c 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -43,7 +43,6 @@ steps: # then, test supported CUDA toolkits (installed through the artifact system) - group: "CUDA" key: "cuda" - depends_on: "julia" steps: - label: "CUDA {{matrix.cuda}}" plugins: @@ -84,6 +83,33 @@ steps: echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml + - group: "Memory" + key: "memory" + steps: + - label: "CuArray with {{matrix.memory}} memory" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + - JuliaCI/julia-test#v1: + test_args: "--quickfail core base libraries" + - JuliaCI/julia-coverage#v1: + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip memory\]/ && !build.pull_request.draft + timeout_in_minutes: 30 + matrix: + setup: + memory: + - "unified" + - "host" + commands: | + echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml + - group: ":nesting_dolls: Subpackages" depends_on: "cuda" steps: @@ -121,6 +147,7 @@ steps: using Pkg println("--- :julia: Instantiating 
project") + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do Pkg.activate(joinpath(pwd(), "lib", lowercase("{{matrix.package}}"))) try @@ -157,6 +184,7 @@ steps: julia --project -e ' using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") println("--- :julia: Instantiating project") withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do Pkg.instantiate() @@ -248,11 +276,15 @@ steps: run_tests: false command: | julia --project -e ' + using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") using CUDA @assert !CUDA.functional() @assert !isdefined(CUDA, :libcudart) CUDA.set_runtime_version!(v"11.6")' julia --project -e ' + using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") using CUDA @assert !CUDA.functional() @assert isdefined(CUDA, :libcudart)' @@ -407,6 +439,7 @@ steps: julia --project -e ' using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") println("--- :julia: Instantiating project") Pkg.resolve() Pkg.instantiate() @@ -441,6 +474,7 @@ steps: command: | julia --project -e ' using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] println("--- :julia: Instantiating project") From fc151cda854f96ae656854125d309e5a88a8ae30 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 11:26:31 +0200 Subject: [PATCH 3/8] CuArrayBackend -> CUDABackend --- src/CUDA.jl | 13 ++++++++----- src/CUDAKernels.jl | 1 - src/gpuarrays.jl | 14 ++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/CUDA.jl b/src/CUDA.jl index 92c5a1b7ee..83322dbaad 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -83,6 +83,11 @@ include("compiler/execution.jl") include("compiler/exceptions.jl") include("compiler/reflection.jl") +# KernelAbstractions +include("CUDAKernels.jl") +import .CUDAKernels: CUDABackend, KA +export CUDABackend + # array implementation include("gpuarrays.jl") include("utilities.jl") @@ -111,6 +116,9 @@ export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND const has_cusolvermg = CUSOLVER.has_cusolvermg export has_cusolvermg +# KA Backend Definition +KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend() + # random depends on CURAND include("random.jl") @@ -119,11 +127,6 @@ include("../lib/nvml/NVML.jl") const has_nvml = NVML.has_nvml export NVML, has_nvml -# KernelAbstractions -include("CUDAKernels.jl") -import .CUDAKernels: CUDABackend -export CUDABackend - # StaticArrays is still a direct dependency, so directly include the extension include("../ext/StaticArraysExt.jl") diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index 23db958933..8298f74426 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -25,7 +25,6 @@ KA.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims) KA.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims) KA.get_backend(::CuArray) = CUDABackend() -KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend() KA.synchronize(::CUDABackend) = synchronize() Adapt.adapt_storage(::CUDABackend, a::Array) = Adapt.adapt(CuArray, a) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index ec0771c706..9bb1e07a78 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,8 +1,5 @@ # GPUArrays.jl interface -import KernelAbstractions -import KernelAbstractions: Backend - # # Device 
functionality # @@ -10,11 +7,16 @@ import KernelAbstractions: Backend ## execution -struct CuArrayBackend <: Backend end -@inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N}; +@inline function GPUArrays.launch_heuristic(::CUDABackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} - kernel = @cuda launch=false f(CuKernelContext(), args...) + + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, + nothing) + + # this might not be the final context, since we may tune the workgroupsize + ctx = KA.mkcontext(obj, ndrange, iterspace) + kernel = @cuda launch=false f(ctx, args...) # launching many large blocks) lowers performance, as observed with broadcast, so cap # the block size if we don't have a grid-stride kernel (which would keep the grid small) From ca1c6893223ee7ccdecc105713465c500684bede Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 10:28:02 -0400 Subject: [PATCH 4/8] attempting a new launch_heuristic --- src/CUDA.jl | 4 +++- src/gpuarrays.jl | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/CUDA.jl b/src/CUDA.jl index 83322dbaad..1795b67ba8 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -43,6 +43,8 @@ using Libdl import NVTX +import KernelAbstractions as KA + using Printf @@ -85,7 +87,7 @@ include("compiler/reflection.jl") # KernelAbstractions include("CUDAKernels.jl") -import .CUDAKernels: CUDABackend, KA +import .CUDAKernels: CUDABackend, KA.launch_config export CUDABackend # array implementation diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 9bb1e07a78..c5ab1e79cb 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -11,12 +11,13 @@ @inline function GPUArrays.launch_heuristic(::CUDABackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} + obj = f(CUDABackend()) ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, nothing) # this might not be the final context, since we may tune the workgroupsize ctx = KA.mkcontext(obj, ndrange, iterspace) - kernel = @cuda launch=false f(ctx, args...) + kernel = @cuda launch=false obj.f(ctx, args...) 
# launching many large blocks) lowers performance, as observed with broadcast, so cap # the block size if we don't have a grid-stride kernel (which would keep the grid small) From aeaa29d47047979c936d4ed3a047b67646f7ac95 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 10:52:12 -0400 Subject: [PATCH 5/8] meh --- src/gpuarrays.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index c5ab1e79cb..ffe48d9845 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -12,7 +12,8 @@ elements::Int, elements_per_thread::Int) where {F,N} obj = f(CUDABackend()) - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, + ndrange = ceil(Int, elements / elements_per_thread) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) # this might not be the final context, since we may tune the workgroupsize From 0a0c211d40fc92d1e29525cbe0431537bd936972 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 17:48:59 -0400 Subject: [PATCH 6/8] I think the tests will pass now --- src/gpuarrays.jl | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index ffe48d9845..54f6731c6c 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -7,16 +7,11 @@ ## execution +@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N}; + elements::Int, elements_per_thread::Int) where {O,N} -@inline function GPUArrays.launch_heuristic(::CUDABackend, f::F, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {F,N} - - obj = f(CUDABackend()) ndrange = ceil(Int, elements / elements_per_thread) - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, - nothing) - - # this might not be the final context, since we may tune the workgroupsize + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) ctx = KA.mkcontext(obj, ndrange, iterspace) kernel = @cuda launch=false obj.f(ctx, args...) From d1d446fadc5bf46d3c9723c3af805b9c8c6c7797 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Thu, 25 Jul 2024 15:22:45 +0200 Subject: [PATCH 7/8] removing launch_heuristic --- src/CUDA.jl | 1 - src/gpuarrays.jl | 25 ------------------------- 2 files changed, 26 deletions(-) delete mode 100644 src/gpuarrays.jl diff --git a/src/CUDA.jl b/src/CUDA.jl index 1795b67ba8..12f70679b3 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -91,7 +91,6 @@ import .CUDAKernels: CUDABackend, KA.launch_config export CUDABackend # array implementation -include("gpuarrays.jl") include("utilities.jl") include("texture.jl") diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl deleted file mode 100644 index 54f6731c6c..0000000000 --- a/src/gpuarrays.jl +++ /dev/null @@ -1,25 +0,0 @@ -# GPUArrays.jl interface - -# -# Device functionality -# - - -## execution - -@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {O,N} - - ndrange = ceil(Int, elements / elements_per_thread) - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) - ctx = KA.mkcontext(obj, ndrange, iterspace) - kernel = @cuda launch=false obj.f(ctx, args...) 
- - # launching many large blocks) lowers performance, as observed with broadcast, so cap - # the block size if we don't have a grid-stride kernel (which would keep the grid small) - if elements_per_thread > 1 - launch_configuration(kernel.fun) - else - launch_configuration(kernel.fun; max_threads=256) - end -end From c7831bcbb3fea8c76bf0c8a0c12b204e81e7bc55 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 16 Sep 2024 14:06:16 +0200 Subject: [PATCH 8/8] Revert "removing launch_heuristic" This reverts commit d1d446fadc5bf46d3c9723c3af805b9c8c6c7797. --- src/CUDA.jl | 1 + src/gpuarrays.jl | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 src/gpuarrays.jl diff --git a/src/CUDA.jl b/src/CUDA.jl index 12f70679b3..1795b67ba8 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -91,6 +91,7 @@ import .CUDAKernels: CUDABackend, KA.launch_config export CUDABackend # array implementation +include("gpuarrays.jl") include("utilities.jl") include("texture.jl") diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl new file mode 100644 index 0000000000..54f6731c6c --- /dev/null +++ b/src/gpuarrays.jl @@ -0,0 +1,25 @@ +# GPUArrays.jl interface + +# +# Device functionality +# + + +## execution + +@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N}; + elements::Int, elements_per_thread::Int) where {O,N} + + ndrange = ceil(Int, elements / elements_per_thread) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) + ctx = KA.mkcontext(obj, ndrange, iterspace) + kernel = @cuda launch=false obj.f(ctx, args...) + + # launching many large blocks) lowers performance, as observed with broadcast, so cap + # the block size if we don't have a grid-stride kernel (which would keep the grid small) + if elements_per_thread > 1 + launch_configuration(kernel.fun) + else + launch_configuration(kernel.fun; max_threads=256) + end +end
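
Note on the end state of this series: after patch 8, GPUArrays.launch_heuristic receives a KernelAbstractions kernel object directly, derives the ndrange and kernel context via KA.launch_config and KA.mkcontext, compiles the wrapped function with @cuda launch=false, and returns an occupancy-based launch configuration (capped at 256 threads unless elements_per_thread > 1, in which case the grid-stride kernel keeps the grid small). For reference, a minimal sketch of user-level code reaching the same CUDABackend through the stock KernelAbstractions API is included below; the saxpy! kernel, problem size, and scalar are illustrative assumptions, not part of the patches.

# Minimal sketch (illustrative only): a KernelAbstractions kernel launched on the
# CUDABackend that this series wires into GPUArrays and CUSPARSE.
using CUDA
using KernelAbstractions

@kernel function saxpy!(y, a, @Const(x))
    i = @index(Global, Linear)
    @inbounds y[i] = a * x[i] + y[i]
end

x = CUDA.rand(Float32, 1024)
y = CUDA.rand(Float32, 1024)

backend = KernelAbstractions.get_backend(y)   # CUDABackend(), as defined in CUDAKernels.jl
kernel  = saxpy!(backend)                     # instantiate the kernel for this backend
kernel(y, 2f0, x; ndrange = length(y))        # launch over length(y) work items
KernelAbstractions.synchronize(backend)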