diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 551b9a1eba..93ea9ea21c 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -43,7 +43,6 @@ steps:
   # then, test supported CUDA toolkits (installed through the artifact system)
   - group: "CUDA"
     key: "cuda"
-    depends_on: "julia"
     steps:
       - label: "CUDA {{matrix.cuda}}"
         plugins:
@@ -84,6 +83,33 @@ steps:
           echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml
           echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml
 
+  - group: "Memory"
+    key: "memory"
+    steps:
+      - label: "CuArray with {{matrix.memory}} memory"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1.10"
+          - JuliaCI/julia-test#v1:
+              test_args: "--quickfail core base libraries"
+          - JuliaCI/julia-coverage#v1:
+              dirs:
+                - src
+                - lib
+                - examples
+        agents:
+          queue: "juliagpu"
+          cuda: "*"
+        if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip memory\]/ && !build.pull_request.draft
+        timeout_in_minutes: 30
+        matrix:
+          setup:
+            memory:
+              - "unified"
+              - "host"
+        commands: |
+          echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml
+
   - group: ":nesting_dolls: Subpackages"
     depends_on: "cuda"
     steps:
@@ -121,6 +147,7 @@ steps:
             using Pkg
 
             println("--- :julia: Instantiating project")
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
               Pkg.activate(joinpath(pwd(), "lib", lowercase("{{matrix.package}}")))
               try
@@ -157,6 +184,7 @@ steps:
           julia --project -e '
             using Pkg
 
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             println("--- :julia: Instantiating project")
             withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
               Pkg.instantiate()
@@ -248,11 +276,15 @@ steps:
         run_tests: false
         command: |
           julia --project -e '
+            using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             using CUDA
             @assert !CUDA.functional()
             @assert !isdefined(CUDA, :libcudart)
             CUDA.set_runtime_version!(v"11.6")'
           julia --project -e '
+            using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             using CUDA
             @assert !CUDA.functional()
             @assert isdefined(CUDA, :libcudart)'
@@ -407,6 +439,7 @@ steps:
           julia --project -e '
             using Pkg
 
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             println("--- :julia: Instantiating project")
             Pkg.resolve()
             Pkg.instantiate()
@@ -441,6 +474,7 @@ steps:
         command: |
           julia --project -e '
             using Pkg
+            Pkg.add(; url="https://github.com/leios/GPUArrays.jl/", rev="yoyoyo_rebase_time")
             ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"]
 
             println("--- :julia: Instantiating project")
diff --git a/src/CUDA.jl b/src/CUDA.jl
index 92c5a1b7ee..1795b67ba8 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -43,6 +43,8 @@ using Libdl
 
 import NVTX
 
+import KernelAbstractions as KA
+
 using Printf
 
 
@@ -83,6 +85,11 @@ include("compiler/execution.jl")
 include("compiler/exceptions.jl")
 include("compiler/reflection.jl")
 
+# KernelAbstractions
+include("CUDAKernels.jl")
+import .CUDAKernels: CUDABackend, KA.launch_config
+export CUDABackend
+
 # array implementation
 include("gpuarrays.jl")
 include("utilities.jl")
@@ -111,6 +118,9 @@ export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND
 const has_cusolvermg = CUSOLVER.has_cusolvermg
 export has_cusolvermg
 
+# KA Backend Definition
+KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend()
+
 # random depends on CURAND
 include("random.jl")
 
@@ -119,11 +129,6 @@ include("../lib/nvml/NVML.jl")
 const has_nvml = NVML.has_nvml
 export NVML, has_nvml
 
-# KernelAbstractions
-include("CUDAKernels.jl")
-import .CUDAKernels: CUDABackend
-export CUDABackend
-
 # StaticArrays is still a direct dependency, so directly include the extension
 include("../ext/StaticArraysExt.jl")
 
diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 23db958933..8298f74426 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -25,7 +25,6 @@ KA.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims)
 KA.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims)
 
 KA.get_backend(::CuArray) = CUDABackend()
-KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend()
 KA.synchronize(::CUDABackend) = synchronize()
 
 Adapt.adapt_storage(::CUDABackend, a::Array) = Adapt.adapt(CuArray, a)
diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index e3aec24c3f..54f6731c6c 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -1,6 +1,5 @@
 # GPUArrays.jl interface
 
-
 #
 # Device functionality
 #
@@ -8,13 +7,13 @@
 
 ## execution
 
-struct CuArrayBackend <: AbstractGPUBackend end
-
-struct CuKernelContext <: AbstractKernelContext end
+@inline function GPUArrays.launch_heuristic(::CUDABackend, obj::O, args::Vararg{Any,N};
+                                            elements::Int, elements_per_thread::Int) where {O,N}
 
-@inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N};
-                                            elements::Int, elements_per_thread::Int) where {F,N}
-    kernel = @cuda launch=false f(CuKernelContext(), args...)
+    ndrange = ceil(Int, elements / elements_per_thread)
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, nothing)
+    ctx = KA.mkcontext(obj, ndrange, iterspace)
+    kernel = @cuda launch=false obj.f(ctx, args...)
 
     # launching many large blocks) lowers performance, as observed with broadcast, so cap
     # the block size if we don't have a grid-stride kernel (which would keep the grid small)
@@ -24,39 +23,3 @@ struct CuKernelContext <: AbstractKernelContext end
         launch_configuration(kernel.fun; max_threads=256)
     end
 end
-
-@inline function GPUArrays.gpu_call(::CuArrayBackend, f::F, args::TT, threads::Int,
-                                    blocks::Int; name::Union{String,Nothing}) where {F,TT}
-    @cuda threads blocks name f(CuKernelContext(), args...)
-end
-
-
-## on-device
-
-# indexing
-
-GPUArrays.blockidx(ctx::CuKernelContext) = blockIdx().x
-GPUArrays.blockdim(ctx::CuKernelContext) = blockDim().x
-GPUArrays.threadidx(ctx::CuKernelContext) = threadIdx().x
-GPUArrays.griddim(ctx::CuKernelContext) = gridDim().x
-
-# memory
-
-@inline function GPUArrays.LocalMemory(::CuKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
-                                       ) where {T, dims, id}
-    ptr = CUDA._shmem(Val(id), T, Val(prod(dims)))
-    ptr = reinterpret(LLVMPtr{T, AS.Shared}, ptr)
-    CuDeviceArray{T,length(dims),AS.Shared}(ptr, dims)
-end
-
-# synchronization
-
-@inline GPUArrays.synchronize_threads(::CuKernelContext) = sync_threads()
-
-
-
-#
-# Host abstractions
-#
-
-GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend()
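
Note (illustration only, not part of the diff): the gpuarrays.jl hunk replaces the old CuArrayBackend/CuKernelContext host and device helpers with the KernelAbstractions CUDABackend; GPUArrays.launch_heuristic now receives a KA kernel object, builds its context with KA.launch_config and KA.mkcontext, and compiles obj.f via @cuda launch=false. Below is a minimal sketch of that KernelAbstractions path using only public KA API; the kernel name and sizes are made up for illustration.

    # Illustration only: a minimal KernelAbstractions sketch targeting the
    # CUDABackend that CUDA.jl exports from src/CUDAKernels.jl.
    using CUDA, KernelAbstractions
    import KernelAbstractions as KA

    # A trivial elementwise kernel written against the KA API.
    @kernel function scale!(A, s)
        i = @index(Global)
        @inbounds A[i] *= s
    end

    A = CUDA.ones(Float32, 1024)
    backend = KA.get_backend(A)              # CUDABackend(), per KA.get_backend(::CuArray)
    scale!(backend)(A, 2f0; ndrange = length(A))
    KA.synchronize(backend)
    @assert all(Array(A) .== 2f0)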