@@ -1,54 +1,23 @@
 # reference implementation on the CPU
-
-# note that most of the code in this file serves to define a functional array type,
-# the actual implementation of GPUArrays-interfaces is much more limited.
+# This acts as a wrapper around KernelAbstractions's parallel CPU
+# functionality. It is useful for testing GPUArrays (and other packages)
+# when no GPU is present.
+# This file follows conventions from AMDGPU.jl.
 
 module JLArrays
 
-export JLArray, JLVector, JLMatrix, jl
-
 using GPUArrays
-
 using Adapt
+import KernelAbstractions
+import KernelAbstractions: Adapt, StaticArrays, Backend, Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config
 
+export JLArray, JLVector, JLMatrix, jl, JLBackend
 
-#
-# Device functionality
-#
-
-const MAXTHREADS = 256
-
-
-## execution
-
-struct JLBackend <: AbstractGPUBackend end
-
-mutable struct JLKernelContext <: AbstractKernelContext
-    blockdim::Int
-    griddim::Int
-    blockidx::Int
-    threadidx::Int
-
-    localmem_counter::Int
-    localmems::Vector{Vector{Array}}
+struct JLBackend <: KernelAbstractions.GPU
+    static::Bool
+    JLBackend(;static::Bool=false) = new(static)
 end
 
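+# The `static` field mirrors the flag of the KernelAbstractions CPU backend;
+# convert_to_cpu (defined below) forwards it when kernels are re-targeted to the host.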
-function JLKernelContext(threads::Int, blockdim::Int)
-    blockcount = prod(blockdim)
-    lmems = [Vector{Array}() for i in 1:blockcount]
-    JLKernelContext(threads, blockdim, 1, 1, 0, lmems)
-end
-
-function JLKernelContext(ctx::JLKernelContext, threadidx::Int)
-    JLKernelContext(
-        ctx.blockdim,
-        ctx.griddim,
-        ctx.blockidx,
-        threadidx,
-        0,
-        ctx.localmems
-    )
-end
 
 struct Adaptor end
 jlconvert(arg) = adapt(Adaptor(), arg)
@@ -60,28 +29,35 @@
 Base.getindex(r::JlRefValue) = r.x
 Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = JlRefValue(adapt(to, r[]))
 
-function GPUArrays.gpu_call(::JLBackend, f, args, threads::Int, blocks::Int;
-                            name::Union{String,Nothing})
-    ctx = JLKernelContext(threads, blocks)
-    device_args = jlconvert.(args)
-    tasks = Array{Task}(undef, threads)
-    for blockidx in 1:blocks
-        ctx.blockidx = blockidx
-        for threadidx in 1:threads
-            thread_ctx = JLKernelContext(ctx, threadidx)
-            tasks[threadidx] = @async f(thread_ctx, device_args...)
-            # TODO: require 1.3 and use Base.Threads.@spawn for actual multithreading
-            # (this would require a different synchronization mechanism)
-        end
-        for t in tasks
-            fetch(t)
-        end
+mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
+    data::DataRef{Vector{UInt8}}
+
+    offset::Int # offset of the data in the buffer, in number of elements
+
+    dims::Dims{N}
+
+    # allocating constructor
+    function JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N}
+        check_eltype(T)
+        maxsize = prod(dims) * sizeof(T)
+        data = Vector{UInt8}(undef, maxsize)
+        ref = DataRef(data)
+        obj = new{T,N}(ref, 0, dims)
+        finalizer(unsafe_free!, obj)
     end
-    return
-end
 
+    # low-level constructor for wrapping existing data
+    function JLArray{T,N}(ref::DataRef{Vector{UInt8}}, dims::Dims{N};
+                          offset::Int=0) where {T,N}
+        check_eltype(T)
+        obj = new{T,N}(ref, offset, dims)
+        finalizer(unsafe_free!, obj)
+    end
+end
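+# (The array contents live in a plain byte buffer behind a reference-counted
+# DataRef, so derived arrays such as views can share it through `offset`.)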
 
-## executed on-device
+Adapt.adapt_storage(::JLBackend, a::Array) = Adapt.adapt(JLArrays.JLArray, a)
+Adapt.adapt_storage(::JLBackend, a::JLArrays.JLArray) = a
+Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Array, a)
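+# These rules let `adapt(JLBackend(), x)` move host arrays to the device type
+# and `adapt(KernelAbstractions.CPU(), x)` convert them back.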
 
 # array type
 
@@ -107,43 +83,6 @@
 @inline Base.getindex(A::JLDeviceArray, index::Integer) = getindex(typed_data(A), index)
 @inline Base.setindex!(A::JLDeviceArray, x, index::Integer) = setindex!(typed_data(A), x, index)
 
-
-# indexing
-
-for f in (:blockidx, :blockdim, :threadidx, :griddim)
-    @eval GPUArrays.$f(ctx::JLKernelContext) = ctx.$f
-end
-
-# memory
-
-function GPUArrays.LocalMemory(ctx::JLKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id}
-    ctx.localmem_counter += 1
-    lmems = ctx.localmems[blockidx(ctx)]
-
-    # first invocation in block
-    data = if length(lmems) < ctx.localmem_counter
-        lmem = fill(zero(T), dims)
-        push!(lmems, lmem)
-        lmem
-    else
-        lmems[ctx.localmem_counter]
-    end
-
-    N = length(dims)
-    JLDeviceArray{T,N}(data, tuple(dims...))
-end
-
-# synchronization
-
-@inline function GPUArrays.synchronize_threads(::JLKernelContext)
-    # All threads are getting started asynchronously, so a yield will yield to the next
-    # execution of the same function, which should call yield at the exact same point in the
-    # program, leading to a chain of yields effectively syncing the tasks (threads).
-    yield()
-    return
-end
-
-
 #
 # Host abstractions
 #
@@ -157,32 +96,6 @@ function check_eltype(T)
     end
 end
 
-mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
-    data::DataRef{Vector{UInt8}}
-
-    offset::Int # offset of the data in the buffer, in number of elements
-
-    dims::Dims{N}
-
-    # allocating constructor
-    function JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N}
-        check_eltype(T)
-        maxsize = prod(dims) * sizeof(T)
-        data = Vector{UInt8}(undef, maxsize)
-        ref = DataRef(data)
-        obj = new{T,N}(ref, 0, dims)
-        finalizer(unsafe_free!, obj)
-    end
-
-    # low-level constructor for wrapping existing data
-    function JLArray{T,N}(ref::DataRef{Vector{UInt8}}, dims::Dims{N};
-                          offset::Int=0) where {T,N}
-        check_eltype(T)
-        obj = new{T,N}(ref, offset, dims)
-        finalizer(unsafe_free!, obj)
-    end
-end
-
 unsafe_free!(a::JLArray) = GPUArrays.unsafe_free!(a.data)
 
 # conversion of untyped data to a typed Array
@@ -392,8 +305,6 @@
 
 ## GPUArrays interfaces
 
-GPUArrays.backend(::Type{<:JLArray}) = JLBackend()
-
 Adapt.adapt_storage(::Adaptor, x::JLArray{T,N}) where {T,N} =
     JLDeviceArray{T,N}(x.data[], x.offset, x.dims)
 
@@ -406,4 +317,47 @@ function GPUArrays.mapreducedim!(f, op, R::AnyJLArray, A::Union{AbstractArray,Br
     R
 end
 
+## KernelAbstractions interface
+
+KernelAbstractions.get_backend(a::JLA) where JLA <: JLArray = JLBackend()
+
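+# Kernel launches reuse KernelAbstractions' CompilerMetadata to carry the
+# ndrange and iteration-space information, as the stock CPU backend does.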
+function KernelAbstractions.mkcontext(kernel::Kernel{JLBackend}, I, _ndrange, iterspace, ::Dynamic) where Dynamic
+    return KernelAbstractions.CompilerMetadata{KernelAbstractions.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
+end
+
+KernelAbstractions.allocate(::JLBackend, ::Type{T}, dims::Tuple) where T = JLArray{T}(undef, dims)
+
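+# Mirrors the CPU backend's launch configuration: normalize scalar ranges to
+# tuples and pick a default workgroup size for dynamically sized kernels.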
+@inline function launch_config(kernel::Kernel{JLBackend}, ndrange, workgroupsize)
+    if ndrange isa Integer
+        ndrange = (ndrange,)
+    end
+    if workgroupsize isa Integer
+        workgroupsize = (workgroupsize,)
+    end
+
+    if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing
+        workgroupsize = (1024,) # Vectorization, 4x unrolling, minimal grain size
+    end
+    iterspace, dynamic = partition(kernel, ndrange, workgroupsize)
+    # partition checked that the ndranges agreed
+    if KernelAbstractions.ndrange(kernel) <: StaticSize
+        ndrange = nothing
+    end
+
+    return ndrange, workgroupsize, iterspace, dynamic
+end
+
+KernelAbstractions.isgpu(b::JLBackend) = false
+
+function convert_to_cpu(obj::Kernel{JLBackend, W, N, F}) where {W, N, F}
+    return Kernel{typeof(KernelAbstractions.CPU(; static = obj.backend.static)), W, N, F}(KernelAbstractions.CPU(; static = obj.backend.static), obj.f)
+end
+
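+# Calling a JLBackend kernel re-wraps it as a KernelAbstractions CPU kernel
+# (preserving the `static` scheduling flag) and runs it on the host, after
+# adapting the arguments to device-side array types.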
+function (obj::Kernel{JLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
+    device_args = jlconvert.(args)
+    new_obj = convert_to_cpu(obj)
+    new_obj(device_args...; ndrange, workgroupsize)
+end
+
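+# Hypothetical usage sketch (not part of this file): any KernelAbstractions
+# kernel can now execute against JLArrays through JLBackend, e.g.
+#
+#     using KernelAbstractions
+#     @kernel function plus_one!(A)
+#         i = @index(Global)
+#         @inbounds A[i] += one(eltype(A))
+#     end
+#
+#     A = JLArray(zeros(Float32, 16))
+#     plus_one!(JLBackend())(A; ndrange=length(A))
+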
 end