From b828c8d27e22a776e142cc02016a3e44010e7575 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Thu, 4 Apr 2024 13:06:24 -0400 Subject: [PATCH 1/8] new GPUArrays interface for KA transition --- src/gpuarrays.jl | 42 +++--------------------------------------- 1 file changed, 3 insertions(+), 39 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index e3aec24c3f..ec0771c706 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,5 +1,7 @@ # GPUArrays.jl interface +import KernelAbstractions +import KernelAbstractions: Backend # # Device functionality @@ -8,9 +10,7 @@ ## execution -struct CuArrayBackend <: AbstractGPUBackend end - -struct CuKernelContext <: AbstractKernelContext end +struct CuArrayBackend <: Backend end @inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} @@ -24,39 +24,3 @@ struct CuKernelContext <: AbstractKernelContext end launch_configuration(kernel.fun; max_threads=256) end end - -@inline function GPUArrays.gpu_call(::CuArrayBackend, f::F, args::TT, threads::Int, - blocks::Int; name::Union{String,Nothing}) where {F,TT} - @cuda threads blocks name f(CuKernelContext(), args...) -end - - -## on-device - -# indexing - -GPUArrays.blockidx(ctx::CuKernelContext) = blockIdx().x -GPUArrays.blockdim(ctx::CuKernelContext) = blockDim().x -GPUArrays.threadidx(ctx::CuKernelContext) = threadIdx().x -GPUArrays.griddim(ctx::CuKernelContext) = gridDim().x - -# memory - -@inline function GPUArrays.LocalMemory(::CuKernelContext, ::Type{T}, ::Val{dims}, ::Val{id} - ) where {T, dims, id} - ptr = CUDA._shmem(Val(id), T, Val(prod(dims))) - ptr = reinterpret(LLVMPtr{T, AS.Shared}, ptr) - CuDeviceArray{T,length(dims),AS.Shared}(ptr, dims) -end - -# synchronization - -@inline GPUArrays.synchronize_threads(::CuKernelContext) = sync_threads() - - - -# -# Host abstractions -# - -GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend() From 9590be3395273ef3c2d42f61e08d9de72744f2ef Mon Sep 17 00:00:00 2001 From: James Schloss Date: Tue, 23 Apr 2024 13:35:59 +0200 Subject: [PATCH 2/8] buildkite: remove this before merging --- .buildkite/pipeline.yml | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 551b9a1eba..93ea9ea21c 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -43,7 +43,6 @@ steps: # then, test supported CUDA toolkits (installed through the artifact system) - group: "CUDA" key: "cuda" - depends_on: "julia" steps: - label: "CUDA {{matrix.cuda}}" plugins: @@ -84,6 +83,33 @@ steps: echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml + - group: "Memory" + key: "memory" + steps: + - label: "CuArray with {{matrix.memory}} memory" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + - JuliaCI/julia-test#v1: + test_args: "--quickfail core base libraries" + - JuliaCI/julia-coverage#v1: + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip memory\]/ && !build.pull_request.draft + timeout_in_minutes: 30 + matrix: + setup: + memory: + - "unified" + - "host" + commands: | + echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml + - group: ":nesting_dolls: Subpackages" depends_on: "cuda" steps: @@ -121,6 +147,7 @@ steps: using Pkg println("--- :julia: Instantiating 
project") + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do Pkg.activate(joinpath(pwd(), "lib", lowercase("{{matrix.package}}"))) try @@ -157,6 +184,7 @@ steps: julia --project -e ' using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") println("--- :julia: Instantiating project") withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do Pkg.instantiate() @@ -248,11 +276,15 @@ steps: run_tests: false command: | julia --project -e ' + using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") using CUDA @assert !CUDA.functional() @assert !isdefined(CUDA, :libcudart) CUDA.set_runtime_version!(v"11.6")' julia --project -e ' + using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") using CUDA @assert !CUDA.functional() @assert isdefined(CUDA, :libcudart)' @@ -407,6 +439,7 @@ steps: julia --project -e ' using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") println("--- :julia: Instantiating project") Pkg.resolve() Pkg.instantiate() @@ -441,6 +474,7 @@ steps: command: | julia --project -e ' using Pkg + Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time") ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"] println("--- :julia: Instantiating project") From fc151cda854f96ae656854125d309e5a88a8ae30 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 11:26:31 +0200 Subject: [PATCH 3/8] CuArrayBackend -> CUDABackend --- src/CUDA.jl | 13 ++++++++----- src/CUDAKernels.jl | 1 - src/gpuarrays.jl | 14 ++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/CUDA.jl b/src/CUDA.jl index 92c5a1b7ee..83322dbaad 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -83,6 +83,11 @@ include("compiler/execution.jl") include("compiler/exceptions.jl") include("compiler/reflection.jl") +# KernelAbstractions +include("CUDAKernels.jl") +import .CUDAKernels: CUDABackend, KA +export CUDABackend + # array implementation include("gpuarrays.jl") include("utilities.jl") @@ -111,6 +116,9 @@ export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND const has_cusolvermg = CUSOLVER.has_cusolvermg export has_cusolvermg +# KA Backend Definition +KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend() + # random depends on CURAND include("random.jl") @@ -119,11 +127,6 @@ include("../lib/nvml/NVML.jl") const has_nvml = NVML.has_nvml export NVML, has_nvml -# KernelAbstractions -include("CUDAKernels.jl") -import .CUDAKernels: CUDABackend -export CUDABackend - # StaticArrays is still a direct dependency, so directly include the extension include("../ext/StaticArraysExt.jl") diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index 23db958933..8298f74426 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -25,7 +25,6 @@ KA.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims) KA.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims) KA.get_backend(::CuArray) = CUDABackend() -KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend() KA.synchronize(::CUDABackend) = synchronize() Adapt.adapt_storage(::CUDABackend, a::Array) = Adapt.adapt(CuArray, a) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index ec0771c706..9bb1e07a78 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,8 +1,5 @@ # GPUArrays.jl interface -import KernelAbstractions -import KernelAbstractions: Backend - # # Device 
functionality # @@ -10,11 +7,16 @@ import KernelAbstractions: Backend ## execution -struct CuArrayBackend <: Backend end -@inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N}; +@inline function GPUArrays.launch_heuristic(::CUDABackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} - kernel = @cuda launch=false f(CuKernelContext(), args...) + + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, + nothing) + + # this might not be the final context, since we may tune the workgroupsize + ctx = KA.mkcontext(obj, ndrange, iterspace) + kernel = @cuda launch=false f(ctx, args...) # launching many large blocks) lowers performance, as observed with broadcast, so cap # the block size if we don't have a grid-stride kernel (which would keep the grid small) From ca1c6893223ee7ccdecc105713465c500684bede Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 10:28:02 -0400 Subject: [PATCH 4/8] attempting a new launch_heuristic --- src/CUDA.jl | 4 +++- src/gpuarrays.jl | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/CUDA.jl b/src/CUDA.jl index 83322dbaad..1795b67ba8 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -43,6 +43,8 @@ using Libdl import NVTX +import KernelAbstractions as KA + using Printf @@ -85,7 +87,7 @@ include("compiler/reflection.jl") # KernelAbstractions include("CUDAKernels.jl") -import .CUDAKernels: CUDABackend, KA +import .CUDAKernels: CUDABackend, KA.launch_config export CUDABackend # array implementation diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 9bb1e07a78..c5ab1e79cb 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -11,12 +11,13 @@ @inline function GPUArrays.launch_heuristic(::CUDABackend, f::F, args::Vararg{Any,N}; elements::Int, elements_per_thread::Int) where {F,N} + obj = f(CUDABackend()) ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, nothing) # this might not be the final context, since we may tune the workgroupsize ctx = KA.mkcontext(obj, ndrange, iterspace) - kernel = @cuda launch=false f(ctx, args...) + kernel = @cuda launch=false obj.f(ctx, args...) 
# launching many large blocks) lowers performance, as observed with broadcast, so cap # the block size if we don't have a grid-stride kernel (which would keep the grid small) From aeaa29d47047979c936d4ed3a047b67646f7ac95 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 10:52:12 -0400 Subject: [PATCH 5/8] meh --- src/gpuarrays.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index c5ab1e79cb..ffe48d9845 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -12,7 +12,8 @@ elements::Int, elements_per_thread::Int) where {F,N} obj = f(CUDABackend()) - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing, + ndrange = ceil(Int, elements / elements_per_thread) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) # this might not be the final context, since we may tune the workgroupsize From 0a0c211d40fc92d1e29525cbe0431537bd936972 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 22 Jul 2024 17:48:59 -0400 Subject: [PATCH 6/8] I think the tests will pass now --- src/gpuarrays.jl | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index ffe48d9845..54f6731c6c 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -7,16 +7,11 @@ ## execution +@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N}; + elements::Int, elements_per_thread::Int) where {O,N} -@inline function GPUArrays.launch_heuristic(::CUDABackend, f::F, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {F,N} - - obj = f(CUDABackend()) ndrange = ceil(Int, elements / elements_per_thread) - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, - nothing) - - # this might not be the final context, since we may tune the workgroupsize + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) ctx = KA.mkcontext(obj, ndrange, iterspace) kernel = @cuda launch=false obj.f(ctx, args...) From d1d446fadc5bf46d3c9723c3af805b9c8c6c7797 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Thu, 25 Jul 2024 15:22:45 +0200 Subject: [PATCH 7/8] removing launch_heuristic --- src/CUDA.jl | 1 - src/gpuarrays.jl | 25 ------------------------- 2 files changed, 26 deletions(-) delete mode 100644 src/gpuarrays.jl diff --git a/src/CUDA.jl b/src/CUDA.jl index 1795b67ba8..12f70679b3 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -91,7 +91,6 @@ import .CUDAKernels: CUDABackend, KA.launch_config export CUDABackend # array implementation -include("gpuarrays.jl") include("utilities.jl") include("texture.jl") diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl deleted file mode 100644 index 54f6731c6c..0000000000 --- a/src/gpuarrays.jl +++ /dev/null @@ -1,25 +0,0 @@ -# GPUArrays.jl interface - -# -# Device functionality -# - - -## execution - -@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {O,N} - - ndrange = ceil(Int, elements / elements_per_thread) - ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) - ctx = KA.mkcontext(obj, ndrange, iterspace) - kernel = @cuda launch=false obj.f(ctx, args...) 
- - # launching many large blocks) lowers performance, as observed with broadcast, so cap - # the block size if we don't have a grid-stride kernel (which would keep the grid small) - if elements_per_thread > 1 - launch_configuration(kernel.fun) - else - launch_configuration(kernel.fun; max_threads=256) - end -end From c7831bcbb3fea8c76bf0c8a0c12b204e81e7bc55 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Mon, 16 Sep 2024 14:06:16 +0200 Subject: [PATCH 8/8] Revert "removing launch_heuristic" This reverts commit d1d446fadc5bf46d3c9723c3af805b9c8c6c7797. --- src/CUDA.jl | 1 + src/gpuarrays.jl | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 src/gpuarrays.jl diff --git a/src/CUDA.jl b/src/CUDA.jl index 12f70679b3..1795b67ba8 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -91,6 +91,7 @@ import .CUDAKernels: CUDABackend, KA.launch_config export CUDABackend # array implementation +include("gpuarrays.jl") include("utilities.jl") include("texture.jl") diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl new file mode 100644 index 0000000000..54f6731c6c --- /dev/null +++ b/src/gpuarrays.jl @@ -0,0 +1,25 @@ +# GPUArrays.jl interface + +# +# Device functionality +# + + +## execution + +@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N}; + elements::Int, elements_per_thread::Int) where {O,N} + + ndrange = ceil(Int, elements / elements_per_thread) + ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing) + ctx = KA.mkcontext(obj, ndrange, iterspace) + kernel = @cuda launch=false obj.f(ctx, args...) + + # launching many large blocks) lowers performance, as observed with broadcast, so cap + # the block size if we don't have a grid-stride kernel (which would keep the grid small) + if elements_per_thread > 1 + launch_configuration(kernel.fun) + else + launch_configuration(kernel.fun; max_threads=256) + end +end
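
Note on the end state of this series: after patch 8, GPUArrays.launch_heuristic receives a KernelAbstractions kernel object directly, derives the ndrange and kernel context via KA.launch_config and KA.mkcontext, compiles the wrapped function with @cuda launch=false, and returns an occupancy-based launch configuration (capped at 256 threads unless elements_per_thread > 1, in which case the grid-stride kernel keeps the grid small). For reference, a minimal sketch of user-level code reaching the same CUDABackend through the stock KernelAbstractions API is included below; the saxpy! kernel, problem size, and scalar are illustrative assumptions, not part of the patches.

# Minimal sketch (illustrative only): a KernelAbstractions kernel launched on the
# CUDABackend that this series wires into GPUArrays and CUSPARSE.
using CUDA
using KernelAbstractions

@kernel function saxpy!(y, a, @Const(x))
    i = @index(Global, Linear)
    @inbounds y[i] = a * x[i] + y[i]
end

x = CUDA.rand(Float32, 1024)
y = CUDA.rand(Float32, 1024)

backend = KernelAbstractions.get_backend(y)   # CUDABackend(), as defined in CUDAKernels.jl
kernel  = saxpy!(backend)                     # instantiate the kernel for this backend
kernel(y, 2f0, x; ndrange = length(y))        # launch over length(y) work items
KernelAbstractions.synchronize(backend)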