diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 551b9a1eba..93ea9ea21c 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -43,7 +43,6 @@ steps:
   # then, test supported CUDA toolkits (installed through the artifact system)
   - group: "CUDA"
     key: "cuda"
-    depends_on: "julia"
     steps:
       - label: "CUDA {{matrix.cuda}}"
         plugins:
@@ -84,6 +83,33 @@ steps:
           echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml
           echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml
 
+  - group: "Memory"
+    key: "memory"
+    steps:
+      - label: "CuArray with {{matrix.memory}} memory"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1.10"
+          - JuliaCI/julia-test#v1:
+              test_args: "--quickfail core base libraries"
+          - JuliaCI/julia-coverage#v1:
+              dirs:
+                - src
+                - lib
+                - examples
+        agents:
+          queue: "juliagpu"
+          cuda: "*"
+        if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip memory\]/ && !build.pull_request.draft
+        timeout_in_minutes: 30
+        matrix:
+          setup:
+            memory:
+              - "unified"
+              - "host"
+        commands: |
+          echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml
+
   - group: ":nesting_dolls: Subpackages"
     depends_on: "cuda"
     steps:
@@ -121,6 +147,7 @@ steps:
             using Pkg
 
             println("--- :julia: Instantiating project")
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
               Pkg.activate(joinpath(pwd(), "lib", lowercase("{{matrix.package}}")))
               try
@@ -157,6 +184,7 @@ steps:
           julia --project -e '
             using Pkg
 
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             println("--- :julia: Instantiating project")
             withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
               Pkg.instantiate()
@@ -248,11 +276,15 @@ steps:
         run_tests: false
         command: |
           julia --project -e '
+            using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             using CUDA
             @assert !CUDA.functional()
             @assert !isdefined(CUDA, :libcudart)
             CUDA.set_runtime_version!(v"11.6")'
           julia --project -e '
+            using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             using CUDA
             @assert !CUDA.functional()
             @assert isdefined(CUDA, :libcudart)'
@@ -407,6 +439,7 @@ steps:
           julia --project -e '
             using Pkg
 
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             println("--- :julia: Instantiating project")
             Pkg.resolve()
             Pkg.instantiate()
@@ -441,6 +474,7 @@ steps:
         command: |
           julia --project -e '
             using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"]
 
             println("--- :julia: Instantiating project")
diff --git a/src/CUDA.jl b/src/CUDA.jl
index 92c5a1b7ee..1795b67ba8 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -43,6 +43,8 @@ using Libdl
 
 import NVTX
 
+import KernelAbstractions as KA
+
 using Printf
 
 
@@ -83,6 +85,11 @@ include("compiler/execution.jl")
 include("compiler/exceptions.jl")
 include("compiler/reflection.jl")
 
+# KernelAbstractions
+include("CUDAKernels.jl")
+import .CUDAKernels: CUDABackend, KA.launch_config
+export CUDABackend
+
 # array implementation
 include("gpuarrays.jl")
 include("utilities.jl")
@@ -111,6 +118,9 @@ export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND
 const has_cusolvermg = CUSOLVER.has_cusolvermg
 export has_cusolvermg
 
+# KA Backend Definition
+KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend()
+
 # random depends on CURAND
 include("random.jl")
 
@@ -119,11 +129,6 @@ include("../lib/nvml/NVML.jl")
 const has_nvml = NVML.has_nvml
 export NVML, has_nvml
 
-# KernelAbstractions
-include("CUDAKernels.jl")
-import .CUDAKernels: CUDABackend
-export CUDABackend
-
 # StaticArrays is still a direct dependency, so directly include the extension
 include("../ext/StaticArraysExt.jl")
 
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 23db958933..8298f74426 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -25,7 +25,6 @@ KA.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims)
 KA.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims)
 
 KA.get_backend(::CuArray) = CUDABackend()
-KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend()
 KA.synchronize(::CUDABackend) = synchronize()
 
 Adapt.adapt_storage(::CUDABackend, a::Array) = Adapt.adapt(CuArray, a)
diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index e3aec24c3f..54f6731c6c 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -1,6 +1,5 @@
 # GPUArrays.jl interface
 
-
 #
 # Device functionality
 #
@@ -8,13 +7,13 @@
 
 ## execution
 
-struct CuArrayBackend <: AbstractGPUBackend end
-
-struct CuKernelContext <: AbstractKernelContext end
+@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N};
+                                            elements::Int, elements_per_thread::Int) where {O,N}
 
-@inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N};
-                                            elements::Int, elements_per_thread::Int) where {F,N}
-    kernel = @cuda launch=false f(CuKernelContext(), args...)
+    ndrange = ceil(Int, elements / elements_per_thread)
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing)
+    ctx = KA.mkcontext(obj, ndrange, iterspace)
+    kernel = @cuda launch=false obj.f(ctx, args...)
 
     # launching many large blocks) lowers performance, as observed with broadcast, so cap
     # the block size if we don't have a grid-stride kernel (which would keep the grid small)
@@ -24,39 +23,3 @@ struct CuKernelContext <: AbstractKernelContext end
         launch_configuration(kernel.fun; max_threads=256)
     end
 end
-
-@inline function GPUArrays.gpu_call(::CuArrayBackend, f::F, args::TT, threads::Int,
-                                    blocks::Int; name::Union{String,Nothing}) where {F,TT}
-    @cuda threads blocks name f(CuKernelContext(), args...)
-end
-
-
-## on-device
-
-# indexing
-
-GPUArrays.blockidx(ctx::CuKernelContext) = blockIdx().x
-GPUArrays.blockdim(ctx::CuKernelContext) = blockDim().x
-GPUArrays.threadidx(ctx::CuKernelContext) = threadIdx().x
-GPUArrays.griddim(ctx::CuKernelContext) = gridDim().x
-
-# memory
-
-@inline function GPUArrays.LocalMemory(::CuKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
-                                       ) where {T, dims, id}
-    ptr = CUDA._shmem(Val(id), T, Val(prod(dims)))
-    ptr = reinterpret(LLVMPtr{T, AS.Shared}, ptr)
-    CuDeviceArray{T,length(dims),AS.Shared}(ptr, dims)
-end
-
-# synchronization
-
-@inline GPUArrays.synchronize_threads(::CuKernelContext) = sync_threads()
-
-
-
-#
-# Host abstractions
-#
-
-GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend()
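
Note (illustration only, not part of the diff): the gpuarrays.jl hunk replaces the old CuArrayBackend/CuKernelContext host and device helpers with the KernelAbstractions CUDABackend; GPUArrays.launch_heuristic now receives a KA kernel object, builds its context with KA.launch_config and KA.mkcontext, and compiles obj.f via @cuda launch=false. Below is a minimal sketch of that KernelAbstractions path using only public KA API; the kernel name and sizes are made up for illustration.

    # Illustration only: a minimal KernelAbstractions sketch targeting the
    # CUDABackend that CUDA.jl exports from src/CUDAKernels.jl.
    using CUDA, KernelAbstractions
    import KernelAbstractions as KA

    # A trivial elementwise kernel written against the KA API.
    @kernel function scale!(A, s)
        i = @index(Global)
        @inbounds A[i] *= s
    end

    A = CUDA.ones(Float32, 1024)
    backend = KA.get_backend(A)              # CUDABackend(), per KA.get_backend(::CuArray)
    scale!(backend)(A, 2f0; ndrange = length(A))
    KA.synchronize(backend)
    @assert all(Array(A) .== 2f0)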