From 38605eef454f3a4196e814338149982e5c69be4b Mon Sep 17 00:00:00 2001 From: James Schloss Date: Wed, 10 Apr 2024 18:49:09 +0200 Subject: [PATCH 1/8] adding necessary changes for KA transition for gpuarrays --- src/gpuarrays.jl | 48 +++--------------------------------------------- 1 file changed, 3 insertions(+), 45 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 614e655d..fd46a859 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,5 +1,7 @@ # GPUArrays.jl interface +import KernelAbstractions +import KernelAbstractions: Backend # # Device functionality @@ -8,9 +10,7 @@ ## execution -struct oneArrayBackend <: AbstractGPUBackend end - -struct oneKernelContext <: AbstractKernelContext end +struct oneArrayBackend <: Backend end @inline function GPUArrays.launch_heuristic(::oneArrayBackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} @@ -23,48 +23,6 @@ struct oneKernelContext <: AbstractKernelContext end return (threads=items, blocks=32) end -function GPUArrays.gpu_call(::oneArrayBackend, f, args, threads::Int, blocks::Int; - name::Union{String,Nothing}) - @oneapi items=threads groups=blocks name=name f(oneKernelContext(), args...) -end - - -## on-device - -# indexing - -GPUArrays.blockidx(ctx::oneKernelContext) = oneAPI.get_group_id(0) -GPUArrays.blockdim(ctx::oneKernelContext) = oneAPI.get_local_size(0) -GPUArrays.threadidx(ctx::oneKernelContext) = oneAPI.get_local_id(0) -GPUArrays.griddim(ctx::oneKernelContext) = oneAPI.get_num_groups(0) - -# math - -@inline GPUArrays.cos(ctx::oneKernelContext, x) = oneAPI.cos(x) -@inline GPUArrays.sin(ctx::oneKernelContext, x) = oneAPI.sin(x) -@inline GPUArrays.sqrt(ctx::oneKernelContext, x) = oneAPI.sqrt(x) -@inline GPUArrays.log(ctx::oneKernelContext, x) = oneAPI.log(x) - -# memory - -@inline function GPUArrays.LocalMemory(::oneKernelContext, ::Type{T}, ::Val{dims}, ::Val{id} - ) where {T, dims, id} - ptr = oneAPI.emit_localmemory(Val(id), T, Val(prod(dims))) - oneDeviceArray(dims, LLVMPtr{T, onePI.AS.Local}(ptr)) -end - -# synchronization - -@inline GPUArrays.synchronize_threads(::oneKernelContext) = oneAPI.barrier() - - - -# -# Host abstractions -# - -GPUArrays.backend(::Type{<:oneArray}) = oneArrayBackend() - const GLOBAL_RNGs = Dict{ZeDevice,GPUArrays.RNG}() function GPUArrays.default_rng(::Type{<:oneArray}) dev = device() From 9e50975b4fa170d74d2187bf5fceaf840faeb84b Mon Sep 17 00:00:00 2001 From: James Schloss Date: Thu, 25 Apr 2024 12:42:36 +0200 Subject: [PATCH 2/8] modifying ci build script to point to correct branch --- deps/build_ci.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/deps/build_ci.jl b/deps/build_ci.jl index 831872c8..68ed8c9c 100644 --- a/deps/build_ci.jl +++ b/deps/build_ci.jl @@ -16,6 +16,7 @@ if isdir(joinpath(@__DIR__), "..", ".git") @info "Latest change to the wrappers: $(unix2datetime(deps_timestamp))" # find out which version of oneAPI_Support_jll we are using + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") Pkg.activate(joinpath(@__DIR__, "..")) Pkg.instantiate() deps = collect(values(Pkg.dependencies())) From 1cc6089e6cc809d40338b8165ba2c6fe7859ce84 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 12:28:02 +0200 Subject: [PATCH 3/8] updating ctx for launch_heuristic --- src/gpuarrays.jl | 12 ++++++++---- src/oneAPI.jl | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index fd46a859..87a14dd4 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -10,11 +10,15 @@ import KernelAbstractions: Backend ## execution -struct oneArrayBackend <: Backend end - -@inline function GPUArrays.launch_heuristic(::oneArrayBackend, f::F, args::Vararg{Any,N}; +@inline function GPUArrays.launch_heuristic(::oneAPIBackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} - kernel = @oneapi launch=false f(oneKernelContext(), args...) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, + nothing) + + # this might not be the final context, since we may tune the workgroupsize + ctx = KA.mkcontext(obj, ndrange, iterspace) + + kernel = @oneapi launch=false f(ctx, args...) items = launch_configuration(kernel) # XXX: how many groups is a good number? the API doesn't tell us. diff --git a/src/oneAPI.jl b/src/oneAPI.jl index 259fea77..5458c167 100644 --- a/src/oneAPI.jl +++ b/src/oneAPI.jl @@ -67,14 +67,14 @@ end # integrations and specialized functionality include("broadcast.jl") include("mapreduce.jl") -include("gpuarrays.jl") -include("random.jl") -include("utils.jl") - include("oneAPIKernels.jl") import .oneAPIKernels: oneAPIBackend export oneAPIBackend +include("gpuarrays.jl") +include("random.jl") +include("utils.jl") + function __init__() precompiling = ccall(:jl_generating_output, Cint, ()) != 0 precompiling && return From 1c9789f28206ab4ea5b598c9a689b6e779adf635 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 23:52:33 +0200 Subject: [PATCH 4/8] mimicking CUDA --- src/gpuarrays.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 87a14dd4..0330464b 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -10,15 +10,16 @@ import KernelAbstractions: Backend ## execution -@inline function GPUArrays.launch_heuristic(::oneAPIBackend, f::F, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {F,N} - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, +@inline function GPUArrays.launch_heuristic(::oneAPIBackend, obj::O, args::Vararg{Any,N}; + elements::Int, elements_per_thread::Int) where {O,N} + ndrange = ceil(Int, elements / elements_per_thread) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) # this might not be the final context, since we may tune the workgroupsize ctx = KA.mkcontext(obj, ndrange, iterspace) - kernel = @oneapi launch=false f(ctx, args...) + kernel = @oneapi launch=false obj.f(ctx, args...) items = launch_configuration(kernel) # XXX: how many groups is a good number? the API doesn't tell us. From b90254214428412ad057f0848658e7344e4747b8 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Tue, 23 Jul 2024 09:33:24 +0200 Subject: [PATCH 5/8] copying CUDA --- src/oneAPI.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/oneAPI.jl b/src/oneAPI.jl index 5458c167..7dfa0532 100644 --- a/src/oneAPI.jl +++ b/src/oneAPI.jl @@ -17,6 +17,8 @@ using Core: LLVMPtr using SPIRV_LLVM_Translator_unified_jll, SPIRV_Tools_jll +import KernelAbstractions as KA + export oneL0 # core library @@ -69,7 +71,7 @@ include("broadcast.jl") include("mapreduce.jl") include("oneAPIKernels.jl") import .oneAPIKernels: oneAPIBackend -export oneAPIBackend +export oneAPIBackend, KA.launch_config include("gpuarrays.jl") include("random.jl") From 867bd46a7f713fd486315d0b1fcbf95c2dbe6737 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Tue, 23 Jul 2024 10:40:57 +0200 Subject: [PATCH 6/8] take 2 --- src/gpuarrays.jl | 2 +- src/oneAPI.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 0330464b..1cc39492 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -11,7 +11,7 @@ import KernelAbstractions: Backend ## execution @inline function GPUArrays.launch_heuristic(::oneAPIBackend, obj::O, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {O,N} + elements::Int, elements_per_thread::Int) where {O,N} ndrange = ceil(Int, elements / elements_per_thread) ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) diff --git a/src/oneAPI.jl b/src/oneAPI.jl index 7dfa0532..00d83593 100644 --- a/src/oneAPI.jl +++ b/src/oneAPI.jl @@ -70,8 +70,8 @@ end include("broadcast.jl") include("mapreduce.jl") include("oneAPIKernels.jl") -import .oneAPIKernels: oneAPIBackend -export oneAPIBackend, KA.launch_config +import .oneAPIKernels: oneAPIBackend, KA.launch_config +export oneAPIBackend include("gpuarrays.jl") include("random.jl") From 319382f0a994d3c5276611076b1875a87b7fe6f6 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Thu, 25 Jul 2024 15:23:22 +0200 Subject: [PATCH 7/8] removing heuristic --- src/gpuarrays.jl | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 1cc39492..d15bac6f 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,8 +1,5 @@ # GPUArrays.jl interface -import KernelAbstractions -import KernelAbstractions: Backend - # # Device functionality # @@ -10,24 +7,6 @@ import KernelAbstractions: Backend ## execution -@inline function GPUArrays.launch_heuristic(::oneAPIBackend, obj::O, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {O,N} - ndrange = ceil(Int, elements / elements_per_thread) - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, - nothing) - - # this might not be the final context, since we may tune the workgroupsize - ctx = KA.mkcontext(obj, ndrange, iterspace) - - kernel = @oneapi launch=false obj.f(ctx, args...) - - items = launch_configuration(kernel) - # XXX: how many groups is a good number? the API doesn't tell us. - # measured on a low-end IGP, 32 blocks seems like a good sweet spot. - # note that this only matters for grid-stride kernels, like broadcast. - return (threads=items, blocks=32) -end - const GLOBAL_RNGs = Dict{ZeDevice,GPUArrays.RNG}() function GPUArrays.default_rng(::Type{<:oneArray}) dev = device() From a8e255425ec3a1d06ec8299dfe8481692bf220f6 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 16 Sep 2024 14:07:33 +0200 Subject: [PATCH 8/8] Revert "removing heuristic" This reverts commit 54796ad22c6b74dcce0d48f1c69ad5eb8b0a5219. --- src/gpuarrays.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index d15bac6f..1cc39492 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,5 +1,8 @@ # GPUArrays.jl interface +import KernelAbstractions +import KernelAbstractions: Backend + # # Device functionality # @@ -7,6 +10,24 @@ ## execution +@inline function GPUArrays.launch_heuristic(::oneAPIBackend, obj::O, args::Vararg{Any,N}; + elements::Int, elements_per_thread::Int) where {O,N} + ndrange = ceil(Int, elements / elements_per_thread) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, + nothing) + + # this might not be the final context, since we may tune the workgroupsize + ctx = KA.mkcontext(obj, ndrange, iterspace) + + kernel = @oneapi launch=false obj.f(ctx, args...) + + items = launch_configuration(kernel) + # XXX: how many groups is a good number? the API doesn't tell us. + # measured on a low-end IGP, 32 blocks seems like a good sweet spot. + # note that this only matters for grid-stride kernels, like broadcast. + return (threads=items, blocks=32) +end + const GLOBAL_RNGs = Dict{ZeDevice,GPUArrays.RNG}() function GPUArrays.default_rng(::Type{<:oneArray}) dev = device()