
Commit 2239640

bors[bot] and vchuravy authored
Merge #28
28: Blocked iteration r=vchuravy a=vchuravy

fixes #22

@mwarusz thank you for the performance analysis. On the V100 I am running on:

| Kernel                | Time    | Speed of Light Mem % |
| --------------------- | ------- | -------------------- |
| naive (32, 32)        | 1.19 ms | 65.06 %              |
| blocked               | 1.20 ms | 64.38 %              |
| naive (1024, 1)       | 3.67 ms | 26.84 %              |
| naive (1024, 1) Const | 1.79 ms | 56.13 %              |
| naive (1, 1024)       | 3.66 ms | 49.53 %              |
| naive (1, 1024) Const | 3.03 ms | 60.02 %              |

Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
2 parents 7fac139 + dfbd9a7 commit 2239640
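
A back-of-the-envelope check of the table above (my own arithmetic, not profiler output): the example added below transposes an 8192 x 8192 Float32 matrix, so each naive (32, 32) launch moves about 0.54 GB in roughly 1.19 ms.

# Effective bandwidth of the naive (32, 32) kernel, using the sizes from
# examples/performance.jl below and the 1.19 ms figure from the table.
N      = 256 * 32                     # grid_dim * block_dim = 8192
bytes  = 2 * N^2 * sizeof(Float32)    # read a, write b: ~0.54 GB
time_s = 1.19e-3
bytes / time_s / 1e9                  # ~451 GB/s; a V100 peaks at roughly 900 GB/s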

File tree

11 files changed: +476 -229 lines

examples/performance.jl

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+using KernelAbstractions
+using CUDAapi
+
+CUDAapi.has_cuda_gpu() || exit()
+
+using CuArrays
+using CUDAdrv
+using CUDAnative
+using CUDAnative.NVTX
+
+@kernel function transpose_kernel_naive!(b, a)
+    I = @index(Global, Cartesian)
+    i, j = I.I
+    @inbounds b[i, j] = a[j, i]
+end
+
+const block_dim = 32
+const grid_dim = 256
+
+@kernel function transpose_kernel!(b, a)
+    block_dim_x, block_dim_y = block_dim, block_dim
+    grid_dim_x, grid_dim_y = grid_dim, grid_dim
+
+    wgsize = prod(groupsize())
+
+    I = @index(Global)
+    L = @index(Local)
+    G = div(I - 1, wgsize) + 1
+
+    thread_idx_x = (L - 1) % block_dim_x + 1
+    thread_idx_y = div(L - 1, block_dim_x) + 1
+
+    block_idx_x = (G - 1) % grid_dim_x + 1
+    block_idx_y = div(G - 1, grid_dim_x) + 1
+
+    i = (block_idx_x - 1) * block_dim_x + thread_idx_x
+    j = (block_idx_y - 1) * block_dim_y + thread_idx_y
+
+    @inbounds b[i + size(b, 1) * (j - 1)] = a[j + size(a, 1) * (i - 1)]
+end
+
+const T = Float32
+const N = grid_dim * block_dim
+const shape = N, N
+const nreps = 10
+
+NVTX.@range "Naive transpose $block_dim, $block_dim" let
+    a = CuArray(rand(T, shape))
+    b = similar(a, shape[2], shape[1])
+    kernel! = transpose_kernel_naive!(CUDA(), (block_dim, block_dim), size(b))
+
+    event = kernel!(b, a)
+    wait(event)
+    @assert Array(b) == Array(a)'
+    @CUDAdrv.profile begin
+        for rep in 1:nreps
+            event = kernel!(b, a, dependencies=(event,))
+        end
+        wait(event)
+    end
+end
+
+NVTX.@range "Naive transpose $(block_dim^2), 1" let
+    a = CuArray(rand(T, shape))
+    b = similar(a, shape[2], shape[1])
+    kernel! = transpose_kernel_naive!(CUDA(), (block_dim*block_dim, 1), size(b))
+
+    event = kernel!(b, a)
+    wait(event)
+    @assert Array(b) == Array(a)'
+    @CUDAdrv.profile begin
+        for rep in 1:nreps
+            event = kernel!(b, a, dependencies=(event,))
+        end
+        wait(event)
+    end
+end
+
+NVTX.@range "Naive transpose 1, $(block_dim^2)" let
+    a = CuArray(rand(T, shape))
+    b = similar(a, shape[2], shape[1])
+    kernel! = transpose_kernel_naive!(CUDA(), (1, block_dim*block_dim), size(b))
+
+    event = kernel!(b, a)
+    wait(event)
+    @assert Array(b) == Array(a)'
+    @CUDAdrv.profile begin
+        for rep in 1:nreps
+            event = kernel!(b, a, dependencies=(event,))
+        end
+        wait(event)
+    end
+end
+
+NVTX.@range "Baseline transpose" let
+    a = CuArray(rand(T, shape))
+    b = similar(a, shape[2], shape[1])
+
+    kernel! = transpose_kernel!(CUDA(), (block_dim*block_dim), length(b))
+
+    event = kernel!(b, a)
+    wait(event)
+    @assert Array(b) == Array(a)'
+    @CUDAdrv.profile begin
+        for rep in 1:nreps
+            event = kernel!(b, a, dependencies=(event,))
+        end
+        wait(event)
+    end
+end
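
To make the index arithmetic in transpose_kernel! concrete, here is a small host-side sketch (an illustration of mine, not part of the commit) that applies the same linear-to-2D decomposition to a single work-item:

# Decompose a global linear index I and local index L into (i, j), exactly as
# transpose_kernel! does; values mirror block_dim = 32, grid_dim = 256 above.
block_dim, grid_dim = 32, 256
wgsize = block_dim * block_dim            # work-items per workgroup

I = 123_456                               # an arbitrary global linear index
L = (I - 1) % wgsize + 1                  # its local index within the workgroup
G = div(I - 1, wgsize) + 1                # its workgroup number

thread_idx_x = (L - 1) % block_dim + 1
thread_idx_y = div(L - 1, block_dim) + 1
block_idx_x  = (G - 1) % grid_dim + 1
block_idx_y  = div(G - 1, grid_dim) + 1

i = (block_idx_x - 1) * block_dim + thread_idx_x
j = (block_idx_y - 1) * block_dim + thread_idx_y
@assert 1 <= i <= grid_dim * block_dim && 1 <= j <= grid_dim * block_dim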

src/KernelAbstractions.jl

Lines changed: 34 additions & 56 deletions
@@ -76,7 +76,9 @@ function async_copy! end
 """
     groupsize()
 
-Query the workgroupsize on the device.
+Query the workgroupsize on the device. This function returns
+a tuple corresponding to kernel configuration. In order to get
+the total size you can use `prod(groupsize())`.
 """
 function groupsize end
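
A quick illustration of the clarified contract (my sketch, not part of this diff): inside a kernel, groupsize() is a tuple, so the flat work-item count is prod(groupsize()), which is exactly how examples/performance.jl above uses it.

using KernelAbstractions

@kernel function group_id_kernel!(out)
    I = @index(Global)                          # linear global index
    L = @index(Local)                           # linear index within the workgroup
    wgsize = prod(groupsize())                  # flat work-items per workgroup
    @inbounds out[I] = div(I - L, wgsize) + 1   # which workgroup this item belongs to
end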

@@ -131,10 +133,6 @@ macro index(locale, args...)
         indexkind = :Linear
     end
 
-    if indexkind === :Cartesian && locale === :Local
-        error("@index(Local, Cartesian) is not implemented yet")
-    end
-
     index_function = Symbol(:__index_, locale, :_, indexkind)
     Expr(:call, GlobalRef(KernelAbstractions, index_function), map(esc, args)...)
 end
@@ -167,31 +165,14 @@ struct CUDA <: GPU end
 # struct AMD <: GPU end
 # struct Intel <: GPU end
 
+include("nditeration.jl")
+using .NDIteration
+import .NDIteration: get
+
 ###
 # Kernel closure struct
 ###
 
-import Base.@pure
-
-abstract type _Size end
-struct DynamicSize <: _Size end
-struct StaticSize{S} <: _Size
-    function StaticSize{S}() where S
-        new{S::Tuple{Vararg{Int}}}()
-    end
-end
-
-@pure StaticSize(s::Tuple{Vararg{Int}}) = StaticSize{s}()
-@pure StaticSize(s::Int...) = StaticSize{s}()
-@pure StaticSize(s::Type{<:Tuple}) = StaticSize{tuple(s.parameters...)}()
-
-# Some @pure convenience functions for `StaticSize`
-@pure get(::Type{StaticSize{S}}) where {S} = S
-@pure get(::StaticSize{S}) where {S} = S
-@pure Base.getindex(::StaticSize{S}, i::Int) where {S} = i <= length(S) ? S[i] : 1
-@pure Base.ndims(::StaticSize{S}) where {S} = length(S)
-@pure Base.length(::StaticSize{S}) where {S} = prod(S)
-
 """
     Kernel{Device, WorkgroupSize, NDRange, Func}
@@ -206,14 +187,7 @@ end
 workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize
 ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize,NDRange} = NDRange
 
-"""
-    partition(kernel, ndrange)
-
-Splits the maximum size of the iteration space by the workgroupsize.
-Returns the number of workgroups necessary and whether the last workgroup
-needs to perform dynamic bounds-checking.
-"""
-@inline function partition(kernel::Kernel, ndrange, workgroupsize)
+function partition(kernel, ndrange, workgroupsize)
     static_ndrange = KernelAbstractions.ndrange(kernel)
     static_workgroupsize = KernelAbstractions.workgroupsize(kernel)

@@ -225,42 +199,49 @@ needs to perform dynamic bounds-checking.
             You created a dynamically sized kernel, but forgot to provide runtime
             parameters for the kernel. Either provide them statically if known
             or dynamically.
-            NDRange(Static): $(typeof(static_ndrange))
+            NDRange(Static): $(static_ndrange)
             NDRange(Dynamic): $(ndrange)
-            Workgroupsize(Static): $(typeof(static_workgroupsize))
+            Workgroupsize(Static): $(static_workgroupsize)
             Workgroupsize(Dynamic): $(workgroupsize)
         """
         error(errmsg)
     end
 
-    if ndrange !== nothing && static_ndrange <: StaticSize
-        if prod(ndrange) != prod(get(static_ndrange))
-            error("Static NDRange and launch NDRange differ")
+    if static_ndrange <: StaticSize
+        if ndrange !== nothing && ndrange != get(static_ndrange)
+            error("Static NDRange ($static_ndrange) and launch NDRange ($ndrange) differ")
         end
+        ndrange = get(static_ndrange)
     end
 
     if static_workgroupsize <: StaticSize
-        @assert length(get(static_workgroupsize)) === 1
-        static_workgroupsize = get(static_workgroupsize)[1]
-        if workgroupsize !== nothing && workgroupsize != static_workgroupsize
-            error("Static WorkgroupSize and launch WorkgroupSize differ")
+        if workgroupsize !== nothing && workgroupsize != get(static_workgroupsize)
+            error("Static WorkgroupSize ($static_workgroupsize) and launch WorkgroupSize $(workgroupsize) differ")
         end
-        workgroupsize = static_workgroupsize
+        workgroupsize = get(static_workgroupsize)
     end
+
     @assert workgroupsize !== nothing
+    @assert ndrange !== nothing
+    blocks, workgroupsize, dynamic = NDIteration.partition(ndrange, workgroupsize)
 
     if static_ndrange <: StaticSize
-        maxsize = prod(get(static_ndrange))
-    else
-        maxsize = prod(ndrange)
+        static_blocks = StaticSize{blocks}
+        blocks = nothing
+    else
+        static_blocks = DynamicSize
+        blocks = CartesianIndices(blocks)
     end
 
-    nworkgroups = fld1(maxsize, workgroupsize)
-    dynamic = mod(maxsize, workgroupsize) != 0
-
-    dynamic || @assert(nworkgroups * workgroupsize == maxsize)
+    if static_workgroupsize <: StaticSize
+        static_workgroupsize = StaticSize{workgroupsize} # we might have padded workgroupsize
+        workgroupsize = nothing
+    else
+        workgroupsize = CartesianIndices(workgroupsize)
+    end
 
-    return nworkgroups, dynamic
+    iterspace = NDRange{length(ndrange), static_blocks, static_workgroupsize}(blocks, workgroupsize)
+    return iterspace, dynamic
 end
 
 ###
@@ -273,10 +254,7 @@ include("compiler.jl")
 # Compiler/Frontend
 ###
 
-@inline function __workitems_iterspace()
-    return 1:groupsize()
-end
-
+function __workitems_iterspace end
 function __validindex end
 
 include("macros.jl")
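
The partition rewrite above now delegates the actual splitting to NDIteration.partition (defined in src/nditeration.jl, one of the files in this commit that is not shown here) and only wraps the result in static or dynamic size types. As a rough, plain-Julia sketch of what such a split computes, my reading rather than the real implementation:

# Ceil-divide every dimension of the ndrange by the workgroup size; a partial
# trailing block in any dimension means bounds checks must stay dynamic.
function sketch_partition(ndrange::NTuple{N,Int}, workgroupsize::NTuple{N,Int}) where {N}
    blocks  = map(cld, ndrange, workgroupsize)
    dynamic = any(mod.(ndrange, workgroupsize) .!= 0)
    return blocks, workgroupsize, dynamic
end

sketch_partition((8192, 8192), (32, 32))   # ((256, 256), (32, 32), false)
sketch_partition((100, 7), (16, 4))        # ((7, 2), (16, 4), true)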

src/backends/cpu.jl

Lines changed: 46 additions & 42 deletions
@@ -14,86 +14,90 @@ function wait(ev::CPUEvent, progress=nothing)
 end
 
 function (obj::Kernel{CPU})(args...; ndrange=nothing, workgroupsize=nothing, dependencies=nothing)
-    if ndrange isa Int
+    if ndrange isa Integer
         ndrange = (ndrange,)
     end
+    if workgroupsize isa Integer
+        workgroupsize = (workgroupsize, )
+    end
     if dependencies isa Event
         dependencies = (dependencies,)
     end
+
     if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
-        workgroupsize = 1024 # Vectorization, 4x unrolling, minimal grain size
+        workgroupsize = (1024,) # Vectorization, 4x unrolling, minimal grain size
     end
-    nblocks, dynamic = partition(obj, ndrange, workgroupsize)
+    iterspace, dynamic = partition(obj, ndrange, workgroupsize)
     # partition checked that the ndrange's agreed
     if KernelAbstractions.ndrange(obj) <: StaticSize
         ndrange = nothing
     end
-    if KernelAbstractions.workgroupsize(obj) <: StaticSize
-        workgroupsize = nothing
-    end
-    t = Threads.@spawn begin
+
+    t = __run(obj, ndrange, iterspace, args, dependencies)
+    return CPUEvent(t)
+end
+
+# Inference barrier
+function __run(obj, ndrange, iterspace, args, dependencies)
+    return Threads.@spawn begin
         if dependencies !== nothing
             Base.sync_end(map(e->e.task, dependencies))
         end
         @sync begin
-            for I in 1:(nblocks-1)
-                let ctx = mkcontext(obj, I, ndrange, workgroupsize)
-                    Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
+            # TODO: how do we use the information that the iteration space maps perfectly to
+            #       the ndrange without incurring a 2x compilation overhead
+            # if dynamic
+                for block in iterspace
+                    let ctx = mkcontextdynamic(obj, block, ndrange, iterspace)
+                        Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
+                    end
                 end
-            end
-
-            if dynamic
-                let ctx = mkcontextdynamic(obj, nblocks, ndrange, workgroupsize)
-                    Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
-                end
-            else
-                let ctx = mkcontext(obj, nblocks, ndrange, workgroupsize)
-                    Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
-                end
-            end
+            # else
+            #     for block in iterspace
+            #         let ctx = mkcontext(obj, blocks, ndrange, iterspace)
+            #             Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
+            #         end
+            #     end
+            # end
         end
     end
-    return CPUEvent(t)
 end
 
 Cassette.@context CPUCtx
 
-function mkcontext(kernel::Kernel{CPU}, I, _ndrange, _workgroupsize)
-    metadata = CompilerMetadata{workgroupsize(kernel), ndrange(kernel), false}(I, _ndrange, _workgroupsize)
+function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace)
+    metadata = CompilerMetadata{ndrange(kernel), false}(I, _ndrange, iterspace)
     Cassette.disablehooks(CPUCtx(pass = CompilerPass, metadata=metadata))
 end
 
-function mkcontextdynamic(kernel::Kernel{CPU}, I, _ndrange, _workgroupsize)
-    metadata = CompilerMetadata{workgroupsize(kernel), ndrange(kernel), true}(I, _ndrange, _workgroupsize)
+function mkcontextdynamic(kernel::Kernel{CPU}, I, _ndrange, iterspace)
+    metadata = CompilerMetadata{ndrange(kernel), true}(I, _ndrange, iterspace)
     Cassette.disablehooks(CPUCtx(pass = CompilerPass, metadata=metadata))
 end
 
-@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Local_Linear), idx)
-    return idx
+@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Local_Linear), idx::CartesianIndex)
+    indices = workitems(__iterspace(ctx.metadata))
+    return @inbounds LinearIndices(indices)[idx]
 end
 
-@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Global_Linear), idx)
-    workgroup = __groupindex(ctx.metadata)
-    (workgroup - 1) * __groupsize(ctx.metadata) + idx
+@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Global_Linear), idx::CartesianIndex)
+    I = @inbounds expand(__iterspace(ctx.metadata), __groupindex(ctx.metadata), idx)
+    @inbounds LinearIndices(__ndrange(ctx.metadata))[I]
 end
 
-@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Local_Cartesian), idx)
-    error("@index(Local, Cartesian) is not yet defined")
+@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Local_Cartesian), idx::CartesianIndex)
+    return idx
 end
 
-@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Global_Cartesian), idx)
-    workgroup = __groupindex(ctx.metadata)
-    indices = __ndrange(ctx.metadata)
-    lI = (workgroup - 1) * __groupsize(ctx.metadata) + idx
-    return @inbounds indices[lI]
+@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Global_Cartesian), idx::CartesianIndex)
+    return @inbounds expand(__iterspace(ctx.metadata), __groupindex(ctx.metadata), idx)
 end
 
-@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__validindex), idx)
+@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__validindex), idx::CartesianIndex)
     # Turns this into a noop for code where we can turn of checkbounds of
     if __dynamic_checkbounds(ctx.metadata)
-        maxidx = prod(size(__ndrange(ctx.metadata)))
-        valid = idx <= mod1(maxidx, __groupsize(ctx.metadata))
-        return valid
+        I = @inbounds expand(__iterspace(ctx.metadata), __groupindex(ctx.metadata), idx)
+        return I in __ndrange(ctx.metadata)
     else
         return true
     end
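
The CPU backend now iterates Cartesian blocks of the iteration space and expands each (block, local work-item) pair into a global index, validating it against the ndrange when dynamic bounds checks are enabled. A dependency-free sketch of that iteration pattern, using ordinary loops instead of Cassette and Threads.@spawn (an illustration, not the actual implementation):

# Blocked iteration over a 100x7 range with 16x4 workgroups; the trailing,
# partial blocks are handled by skipping out-of-range indices, which is what
# __validindex does when dynamic bounds checking is on.
ndrange       = CartesianIndices((100, 7))
workgroupsize = CartesianIndices((16, 4))
nblocks       = CartesianIndices(cld.(size(ndrange), size(workgroupsize)))

out = zeros(Int, size(ndrange))
for block in nblocks, local_idx in workgroupsize
    # "expand" the (block, local) pair into a global Cartesian index
    I = CartesianIndex(Tuple(local_idx) .+ (Tuple(block) .- 1) .* size(workgroupsize))
    I in ndrange || continue      # skip the padded part of the last blocks
    out[I] += 1
end
@assert all(out .== 1)            # every index of the ndrange is visited exactly once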
