
Commit 7b2dc94

Merge pull request #16 from JuliaGPU/vc/const
implement Const memory for GPU and CPU
2 parents 4a1f0b2 + ce40c7a commit 7b2dc94

5 files changed: +85 -10 lines changed

docs/src/kernels.md

Lines changed: 7 additions & 2 deletions

@@ -1,11 +1,16 @@
 # Writing kernels
 
 These kernel language constructs are intended to be used as part
-of [`@kernel`](@ref) functions and not outside that context.
+of [`@kernel`](@ref) functions and are not valid outside that context.
 
 ## Constant arguments
 
-[`@Const`](@ref)
+Kernel functions allow for input arguments to be marked with the
+[`@Const`](@ref) macro. It informs the compiler that the memory
+accessed through that marked input argument will not be written
+to as part of the kernel. This has the implication that input arguments
+are **not** allowed to alias each other. If you are used to CUDA C, this
+is similar to `const restrict`.
 
 ## Indexing
 
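Since the documentation added above only names the macro, here is a small usage sketch. It is modeled on the `constarg` test kernel added in this commit and on the package's README-style launch convention; the `copy_const!` name, the array sizes, and the launch/wait lines are illustrative assumptions, not part of this diff.

using KernelAbstractions

# B is marked @Const: the kernel never writes through B, and the compiler
# may assume B does not alias the output argument A.
@kernel function copy_const!(A, @Const(B))
    I = @index(Global)
    @inbounds A[I] = B[I]
end

A = zeros(Float32, 1024)
B = rand(Float32, 1024)
kernel = copy_const!(CPU(), 8)           # CPU backend, workgroup size 8
event  = kernel(A, B, ndrange=size(A))   # assumed launch style; returns an event
wait(event)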

src/KernelAbstractions.jl

Lines changed: 8 additions & 2 deletions

@@ -7,6 +7,7 @@ export Device, GPU, CPU, CUDA
 using StaticArrays
 using Cassette
 using Requires
+using Adapt
 
 """
     @kernel function f(args) end
@@ -147,6 +148,13 @@ function __index_Global_Linear end
 
 function __index_Local_Cartesian end
 function __index_Global_Cartesian end
+
+struct ConstAdaptor end
+
+Adapt.adapt_storage(to::ConstAdaptor, a::Array) = Base.Experimental.Const(a)
+
+constify(arg) = adapt(ConstAdaptor(), arg)
+
 ###
 # Backend hierarchy
 ###
@@ -271,8 +279,6 @@ end
 
 function __validindex end
 
-# TODO: GPU ConstWrapper that forwards loads to `ldg` and forbids stores
-ConstWrapper(A) = A
 include("macros.jl")
 
 ###
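A note on the new `constify` helper: it relies on Adapt.jl to rewrite the storage of an argument while leaving anything without a matching `adapt_storage` method untouched, so on the CPU path only plain `Array`s get wrapped. A minimal sketch of that behavior, assuming nothing beyond the definitions added above (the trailing comments describe expected results, not output captured from the commit):

using Adapt

struct ConstAdaptor end
Adapt.adapt_storage(::ConstAdaptor, a::Array) = Base.Experimental.Const(a)
constify(arg) = adapt(ConstAdaptor(), arg)

A = rand(Float32, 4)
constify(A)          # wrapped as Base.Experimental.Const{Float32,1}
constify((A, 1.0))   # Adapt recurses through tuples; the Float64 passes through unchanged
constify(2)          # non-array arguments are returned as-is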

src/backends/cuda.jl

Lines changed: 30 additions & 1 deletion

@@ -1,5 +1,5 @@
 import CUDAnative, CUDAdrv
-import CUDAnative: cufunction
+import CUDAnative: cufunction, DevicePtr
 import CUDAdrv: CuEvent, CuStream, CuDefaultStream
 
 const FREE_STREAMS = CuStream[]
@@ -218,3 +218,32 @@ end
 @inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__synchronize))
     CUDAnative.sync_threads()
 end
+
+###
+# GPU implementation of `@Const`
+###
+struct ConstCuDeviceArray{T,N,A} <: AbstractArray{T,N}
+    shape::Dims{N}
+    ptr::DevicePtr{T,A}
+
+    # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
+    ConstCuDeviceArray{T,N,A}(shape::Dims{N}, ptr::DevicePtr{T,A}) where {T,A,N} = new(shape,ptr)
+end
+
+Adapt.adapt_storage(to::ConstAdaptor, a::CUDAnative.CuDeviceArray{T,N,A}) where {T,N,A} = ConstCuDeviceArray{T, N, A}(a.shape, a.ptr)
+
+Base.pointer(a::ConstCuDeviceArray) = a.ptr
+Base.pointer(a::ConstCuDeviceArray, i::Integer) =
+    pointer(a) + (i - 1) * Base.elsize(a)
+
+Base.elsize(::Type{<:ConstCuDeviceArray{T}}) where {T} = sizeof(T)
+Base.size(g::ConstCuDeviceArray) = g.shape
+Base.length(g::ConstCuDeviceArray) = prod(g.shape)
+
+Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::ConstCuDeviceArray{T,N,A}) where {T,A,N} = pointer(a)
+
+@inline function Base.getindex(A::ConstCuDeviceArray{T}, index::Integer) where {T}
+    @boundscheck checkbounds(A, index)
+    align = Base.datatype_alignment(T)
+    CUDAnative.unsafe_cached_load(pointer(A), index, Val(align))::T
+end
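On the GPU path the same `constify` call fires inside the kernel body, after the launch has already converted the `CuArray` argument into a `CuDeviceArray`; the `adapt_storage` method above then swaps that for a `ConstCuDeviceArray`, whose `getindex` goes through `unsafe_cached_load`, the read-only (`ldg`) cache path. A hypothetical host-side walk-through of the type conversion, not code from the commit (indexing the result on the host would fail, since it carries a device pointer):

using CuArrays, CUDAnative, KernelAbstractions

A  = CuArrays.rand(Float32, 16)
dA = CUDAnative.cudaconvert(A)          # CuDeviceArray, the form a kernel sees
cA = KernelAbstractions.constify(dA)    # ConstCuDeviceArray{Float32,1,AS.Global}
typeof(cA)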

src/macros.jl

Lines changed: 4 additions & 5 deletions

@@ -66,7 +66,7 @@ function transform_gpu(expr, args)
     new_stmts = Expr[]
     for (arg, isconst) in args
         if isconst
-            push!(new_stmts, :($arg = $ConstWrapper($arg)))
+            push!(new_stmts, :($arg = $constify($arg)))
         end
     end
     return quote
@@ -148,16 +148,15 @@ function transform_cpu(stmts, args)
     new_stmts = Expr[]
     for (arg, isconst) in args
         if isconst
-            # XXX: Deal with OffsetArrays
-            push!(new_stmts, :($arg = $Base.Experimental.Const($arg)))
+            push!(new_stmts, :($arg = $constify($arg)))
         end
     end
     loops = split(stmts)
     body = generate_cpu_code(loops)
 
-    # push!(new_stmts, Expr(:aliasscope))
+    push!(new_stmts, Expr(:aliasscope))
     push!(new_stmts, body)
-    # push!(new_stmts, Expr(:popaliasscope))
+    push!(new_stmts, Expr(:popaliasscope))
     push!(new_stmts, :(return nothing))
     return Expr(:block, new_stmts...)
 end
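On the CPU, `transform_cpu` now brackets the generated loop body with `Expr(:aliasscope)` and `Expr(:popaliasscope)`, which is what lets LLVM attach the `!alias.scope` and `!noalias` metadata the new test checks for. As a rough hand-written analogue of what the transform produces for one `@Const` argument (a sketch, not actual macro output; `Base.Experimental.@aliasscope` is the surface syntax for the same expression pair):

# wrap the argument in Const, then run the body inside an alias scope so
# the compiler may assume no stores alias the reads through B
function copyto_noalias!(A, B)
    B = Base.Experimental.Const(B)
    Base.Experimental.@aliasscope begin
        @inbounds @simd for i in eachindex(A)
            A[i] = B[i]
        end
    end
    return nothing
end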

test/test.jl

Lines changed: 36 additions & 0 deletions

@@ -1,7 +1,9 @@
 using KernelAbstractions
 using CUDAapi
+using InteractiveUtils
 if has_cuda_gpu()
     using CuArrays
+    using CUDAnative
     CuArrays.allowscalar(false)
 end
 
@@ -131,4 +133,38 @@ end
     if has_cuda_gpu()
         indextest(CUDA(), CuArray)
     end
+end
+
+@kernel function constarg(A, @Const(B))
+    I = @index(Global)
+    @inbounds A[I] = B[I]
+end
+
+@testset "Const" begin
+    let kernel = constarg(CPU(), 8, (1024,))
+        # this is poking at internals
+        ctx = KernelAbstractions.mkcontext(kernel, 1, nothing, nothing)
+        AT = Array{Float32, 2}
+        IR = sprint() do io
+            code_llvm(io, KernelAbstractions.Cassette.overdub,
+                      (typeof(ctx), typeof(kernel.f), AT, AT),
+                      optimize=false, raw=true)
+        end
+        @test occursin("!alias.scope", IR)
+        @test occursin("!noalias", IR)
+    end
+
+    if has_cuda_gpu()
+        let kernel = constarg(CUDA(), 8, (1024,))
+            # this is poking at internals
+            ctx = KernelAbstractions.mkcontext(kernel, nothing)
+            AT = CUDAnative.CuDeviceArray{Float32, 2, CUDAnative.AS.Global}
+            IR = sprint() do io
+                CUDAnative.code_llvm(io, KernelAbstractions.Cassette.overdub,
+                                     (typeof(ctx), typeof(kernel.f), AT, AT),
+                                     kernel=true, optimize=false)
+            end
+            @test occursin("@llvm.nvvm.ldg", IR)
+        end
+    end
 end
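The GPU branch of this testset asserts on unoptimized LLVM IR. A hypothetical follow-up check, not part of this commit, is to look at the generated PTX instead, where `unsafe_cached_load` shows up as a non-coherent global load on devices that support the read-only cache (sm_35 and newer); it reuses the `constarg` kernel defined in the test file above.

using Test, CUDAapi, KernelAbstractions, CUDAnative

if has_cuda_gpu()
    kernel = constarg(CUDA(), 8, (1024,))
    ctx = KernelAbstractions.mkcontext(kernel, nothing)
    AT = CUDAnative.CuDeviceArray{Float32, 2, CUDAnative.AS.Global}
    ptx = sprint() do io
        CUDAnative.code_ptx(io, KernelAbstractions.Cassette.overdub,
                            (typeof(ctx), typeof(kernel.f), AT, AT); kernel=true)
    end
    @test occursin("ld.global.nc", ptx)
end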
