From 38605eef454f3a4196e814338149982e5c69be4b Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Wed, 10 Apr 2024 18:49:09 +0200
Subject: [PATCH 1/8] adding necessary changes for KA transition for gpuarrays

---
 src/gpuarrays.jl | 48 +++---------------------------------------------
 1 file changed, 3 insertions(+), 45 deletions(-)

diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index 614e655d..fd46a859 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -1,5 +1,7 @@
 # GPUArrays.jl interface
 
+import KernelAbstractions
+import KernelAbstractions: Backend
 
 #
 # Device functionality
@@ -8,9 +10,7 @@
 
 ## execution
 
-struct oneArrayBackend <: AbstractGPUBackend end
-
-struct oneKernelContext <: AbstractKernelContext end
+struct oneArrayBackend <: Backend end
 
 @inline function GPUArrays.launch_heuristic(::oneArrayBackend, f::F, args::Vararg{Any,N};
                                              elements::Int, elements_per_thread::Int) where {F,N}
@@ -23,48 +23,6 @@ struct oneKernelContext <: AbstractKernelContext end
     return (threads=items, blocks=32)
 end
 
-function GPUArrays.gpu_call(::oneArrayBackend, f, args, threads::Int, blocks::Int;
-                            name::Union{String,Nothing})
-    @oneapi items=threads groups=blocks name=name f(oneKernelContext(), args...)
-end
-
-
-## on-device
-
-# indexing
-
-GPUArrays.blockidx(ctx::oneKernelContext) = oneAPI.get_group_id(0)
-GPUArrays.blockdim(ctx::oneKernelContext) = oneAPI.get_local_size(0)
-GPUArrays.threadidx(ctx::oneKernelContext) = oneAPI.get_local_id(0)
-GPUArrays.griddim(ctx::oneKernelContext) = oneAPI.get_num_groups(0)
-
-# math
-
-@inline GPUArrays.cos(ctx::oneKernelContext, x) = oneAPI.cos(x)
-@inline GPUArrays.sin(ctx::oneKernelContext, x) = oneAPI.sin(x)
-@inline GPUArrays.sqrt(ctx::oneKernelContext, x) = oneAPI.sqrt(x)
-@inline GPUArrays.log(ctx::oneKernelContext, x) = oneAPI.log(x)
-
-# memory
-
-@inline function GPUArrays.LocalMemory(::oneKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
-                                      ) where {T, dims, id}
-    ptr = oneAPI.emit_localmemory(Val(id), T, Val(prod(dims)))
-    oneDeviceArray(dims, LLVMPtr{T, onePI.AS.Local}(ptr))
-end
-
-# synchronization
-
-@inline GPUArrays.synchronize_threads(::oneKernelContext) = oneAPI.barrier()
-
-
-
-#
-# Host abstractions
-#
-
-GPUArrays.backend(::Type{<:oneArray}) = oneArrayBackend()
-
 const GLOBAL_RNGs = Dict{ZeDevice,GPUArrays.RNG}()
 function GPUArrays.default_rng(::Type{<:oneArray})
     dev = device()

From 9e50975b4fa170d74d2187bf5fceaf840faeb84b Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Thu, 25 Apr 2024 12:42:36 +0200
Subject: [PATCH 2/8] modifying ci build script to point to correct branch

---
 deps/build_ci.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deps/build_ci.jl b/deps/build_ci.jl
index 831872c8..68ed8c9c 100644
--- a/deps/build_ci.jl
+++ b/deps/build_ci.jl
@@ -16,6 +16,7 @@ if isdir(joinpath(@__DIR__), "..", ".git")
     @info "Latest change to the wrappers: $(unix2datetime(deps_timestamp))"
 
     # find out which version of oneAPI_Support_jll we are using
+    Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
     Pkg.activate(joinpath(@__DIR__, ".."))
     Pkg.instantiate()
     deps = collect(values(Pkg.dependencies()))

From 1cc6089e6cc809d40338b8165ba2c6fe7859ce84 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Mon, 22 Jul 2024 12:28:02 +0200
Subject: [PATCH 3/8] updating ctx for launch_heuristic

---
 src/gpuarrays.jl | 12 ++++++++----
 src/oneAPI.jl    |  8 ++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index fd46a859..87a14dd4 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -10,11 +10,15 @@ import KernelAbstractions: Backend
 
 ## execution
 
-struct oneArrayBackend <: Backend end
-
-@inline function GPUArrays.launch_heuristic(::oneArrayBackend, f::F, args::Vararg{Any,N};
+@inline function GPUArrays.launch_heuristic(::oneAPIBackend, f::F, args::Vararg{Any,N};
                                              elements::Int, elements_per_thread::Int) where {F,N}
-    kernel = @oneapi launch=false f(oneKernelContext(), args...)
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing,
+                                                                  nothing)
+
+    # this might not be the final context, since we may tune the workgroupsize
+    ctx = KA.mkcontext(obj, ndrange, iterspace)
+
+    kernel = @oneapi launch=false f(ctx, args...)
 
     items = launch_configuration(kernel)
     # XXX: how many groups is a good number? the API doesn't tell us.
diff --git a/src/oneAPI.jl b/src/oneAPI.jl
index 259fea77..5458c167 100644
--- a/src/oneAPI.jl
+++ b/src/oneAPI.jl
@@ -67,14 +67,14 @@ end
 # integrations and specialized functionality
 include("broadcast.jl")
 include("mapreduce.jl")
-include("gpuarrays.jl")
-include("random.jl")
-include("utils.jl")
-
 include("oneAPIKernels.jl")
 import .oneAPIKernels: oneAPIBackend
 export oneAPIBackend
 
+include("gpuarrays.jl")
+include("random.jl")
+include("utils.jl")
+
 function __init__()
     precompiling = ccall(:jl_generating_output, Cint, ()) != 0
     precompiling && return

From 1c9789f28206ab4ea5b598c9a689b6e779adf635 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Mon, 22 Jul 2024 23:52:33 +0200
Subject: [PATCH 4/8] mimicking CUDA

---
 src/gpuarrays.jl | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index 87a14dd4..0330464b 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -10,15 +10,16 @@ import KernelAbstractions: Backend
 
 ## execution
 
-@inline function GPUArrays.launch_heuristic(::oneAPIBackend, f::F, args::Vararg{Any,N};
-                                             elements::Int, elements_per_thread::Int) where {F,N}
-    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing,
+@inline function GPUArrays.launch_heuristic(::oneAPIBackend, obj::O, args::Vararg{Any,N};
+                                             elements::Int, elements_per_thread::Int) where {O,N}
+    ndrange = ceil(Int, elements / elements_per_thread)
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange,
                                                                   nothing)
 
     # this might not be the final context, since we may tune the workgroupsize
     ctx = KA.mkcontext(obj, ndrange, iterspace)
 
-    kernel = @oneapi launch=false f(ctx, args...)
+    kernel = @oneapi launch=false obj.f(ctx, args...)
 
     items = launch_configuration(kernel)
     # XXX: how many groups is a good number? the API doesn't tell us.

From b90254214428412ad057f0848658e7344e4747b8 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Tue, 23 Jul 2024 09:33:24 +0200
Subject: [PATCH 5/8] copying CUDA

---
 src/oneAPI.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/oneAPI.jl b/src/oneAPI.jl
index 5458c167..7dfa0532 100644
--- a/src/oneAPI.jl
+++ b/src/oneAPI.jl
@@ -17,6 +17,8 @@ using Core: LLVMPtr
 
 using SPIRV_LLVM_Translator_unified_jll, SPIRV_Tools_jll
 
+import KernelAbstractions as KA
+
 export oneL0
 
 # core library
@@ -69,7 +71,7 @@ include("broadcast.jl")
 include("mapreduce.jl")
 include("oneAPIKernels.jl")
 import .oneAPIKernels: oneAPIBackend
-export oneAPIBackend
+export oneAPIBackend, KA.launch_config
 
 include("gpuarrays.jl")
 include("random.jl")

From 867bd46a7f713fd486315d0b1fcbf95c2dbe6737 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Tue, 23 Jul 2024 10:40:57 +0200
Subject: [PATCH 6/8] take 2

---
 src/gpuarrays.jl | 2 +-
 src/oneAPI.jl    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index 0330464b..1cc39492 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -11,7 +11,7 @@ import KernelAbstractions: Backend
 ## execution
 
 @inline function GPUArrays.launch_heuristic(::oneAPIBackend, obj::O, args::Vararg{Any,N};
-                                             elements::Int, elements_per_thread::Int) where {O,N}
+                                            elements::Int, elements_per_thread::Int) where {O,N}
     ndrange = ceil(Int, elements / elements_per_thread)
     ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange,
                                                                   nothing)
diff --git a/src/oneAPI.jl b/src/oneAPI.jl
index 7dfa0532..00d83593 100644
--- a/src/oneAPI.jl
+++ b/src/oneAPI.jl
@@ -70,8 +70,8 @@ end
 include("broadcast.jl")
 include("mapreduce.jl")
 include("oneAPIKernels.jl")
-import .oneAPIKernels: oneAPIBackend
-export oneAPIBackend, KA.launch_config
+import .oneAPIKernels: oneAPIBackend, KA.launch_config
+export oneAPIBackend
 
 include("gpuarrays.jl")
 include("random.jl")

From 319382f0a994d3c5276611076b1875a87b7fe6f6 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Thu, 25 Jul 2024 15:23:22 +0200
Subject: [PATCH 7/8] removing heuristic

---
 src/gpuarrays.jl | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index 1cc39492..d15bac6f 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -1,8 +1,5 @@
 # GPUArrays.jl interface
 
-import KernelAbstractions
-import KernelAbstractions: Backend
-
 #
 # Device functionality
 #
@@ -10,24 +7,6 @@ import KernelAbstractions: Backend
 
 ## execution
 
-@inline function GPUArrays.launch_heuristic(::oneAPIBackend, obj::O, args::Vararg{Any,N};
-                                            elements::Int, elements_per_thread::Int) where {O,N}
-    ndrange = ceil(Int, elements / elements_per_thread)
-    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange,
-                                                                  nothing)
-
-    # this might not be the final context, since we may tune the workgroupsize
-    ctx = KA.mkcontext(obj, ndrange, iterspace)
-
-    kernel = @oneapi launch=false obj.f(ctx, args...)
-
-    items = launch_configuration(kernel)
-    # XXX: how many groups is a good number? the API doesn't tell us.
-    #      measured on a low-end IGP, 32 blocks seems like a good sweet spot.
-    #      note that this only matters for grid-stride kernels, like broadcast.
-    return (threads=items, blocks=32)
-end
-
 const GLOBAL_RNGs = Dict{ZeDevice,GPUArrays.RNG}()
 function GPUArrays.default_rng(::Type{<:oneArray})
     dev = device()

From a8e255425ec3a1d06ec8299dfe8481692bf220f6 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Mon, 16 Sep 2024 14:07:33 +0200
Subject: [PATCH 8/8] Revert "removing heuristic"

This reverts commit 54796ad22c6b74dcce0d48f1c69ad5eb8b0a5219.
---
 src/gpuarrays.jl | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index d15bac6f..1cc39492 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -1,5 +1,8 @@
 # GPUArrays.jl interface
 
+import KernelAbstractions
+import KernelAbstractions: Backend
+
 #
 # Device functionality
 #
@@ -7,6 +10,24 @@
 
 ## execution
 
+@inline function GPUArrays.launch_heuristic(::oneAPIBackend, obj::O, args::Vararg{Any,N};
+                                            elements::Int, elements_per_thread::Int) where {O,N}
+    ndrange = ceil(Int, elements / elements_per_thread)
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange,
+                                                                  nothing)
+
+    # this might not be the final context, since we may tune the workgroupsize
+    ctx = KA.mkcontext(obj, ndrange, iterspace)
+
+    kernel = @oneapi launch=false obj.f(ctx, args...)
+
+    items = launch_configuration(kernel)
+    # XXX: how many groups is a good number? the API doesn't tell us.
+    #      measured on a low-end IGP, 32 blocks seems like a good sweet spot.
+    #      note that this only matters for grid-stride kernels, like broadcast.
+    return (threads=items, blocks=32)
+end
+
 const GLOBAL_RNGs = Dict{ZeDevice,GPUArrays.RNG}()
 function GPUArrays.default_rng(::Type{<:oneArray})
     dev = device()