Commit 4721f60

Merge pull request #801 from JuliaGPU/tb/improvements
Various improvements
2 parents: 87b1f4a + 44a7951

File tree: 6 files changed (+29 −13 lines)

Manifest.toml

Lines changed: 3 additions & 1 deletion

@@ -83,7 +83,9 @@ version = "6.2.0"
 
 [[GPUCompiler]]
 deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "0bae2d5a40c54c6c680a1a223f83a3c415ec730b"
+git-tree-sha1 = "386f3a455ebd4ad9a0b752715977ba70a59462ab"
+repo-rev = "36ddc724add5835fd901d4bf1eca929865cf6dd7"
+repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl.git"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
 version = "0.11.0"
 
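For context, a repo-url/repo-rev pair like the one above is what Julia's package manager records when a dependency is tracked at a specific git revision rather than a registered release. A minimal sketch of reproducing such a pin (assuming Julia ≥ 1.5; the revision hash is the one from the diff):

    using Pkg

    # Track GPUCompiler.jl at the exact commit pinned in this Manifest.
    Pkg.add(url="https://github.com/JuliaGPU/GPUCompiler.jl.git",
            rev="36ddc724add5835fd901d4bf1eca929865cf6dd7")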

deps/compatibility.jl

Lines changed: 1 addition & 4 deletions

@@ -178,10 +178,7 @@ function llvm_compat(version=LLVM.version())
     InitializeNVPTXTarget()
 
     cap_support = sort(collect(llvm_cap_support(version)))
-
-    ptx_support = llvm_ptx_support(version)
-    push!(ptx_support, v"6.0") # JuliaLang/julia#23817
-    ptx_support = sort(collect(ptx_support))
+    ptx_support = sort(collect(llvm_ptx_support(version)))
 
     return (cap=cap_support, ptx=ptx_support)
 end
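The removed push! was a workaround for JuliaLang/julia#23817, where the bundled LLVM did not advertise PTX ISA 6.0 support; with GPUCompiler pinned as above, llvm_ptx_support is assumed to report the supported ISA versions directly. A hypothetical check of the simplified result (names from the diff; the exact versions depend on the local LLVM):

    compat = llvm_compat()                  # NamedTuple of version lists
    @assert compat.ptx == sort(compat.ptx)  # both fields come back sorted
    @show compat.cap compat.ptx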

src/compiler/gpucompiler.jl

Lines changed: 2 additions & 1 deletion

@@ -1,5 +1,6 @@
 function CUDACompilerTarget(dev::CuDevice; kwargs...)
     cap = supported_capability(dev)
+    ptx = v"6.3" # we only need 6.2, but NVPTX doesn't support that
 
     exitable = true
     if cap < v"7"
@@ -16,7 +17,7 @@ function CUDACompilerTarget(dev::CuDevice; kwargs...)
 
     debuginfo = false
 
-    PTXCompilerTarget(; cap, exitable, debuginfo, kwargs...)
+    PTXCompilerTarget(; cap, ptx, exitable, debuginfo, kwargs...)
 end
 
 struct CUDACompilerParams <: AbstractCompilerParams end
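The call site uses Julia's implicit keyword syntax: PTXCompilerTarget(; cap, ptx, ...) is shorthand for PTXCompilerTarget(; cap=cap, ptx=ptx, ...). For illustration, a hypothetical explicit-keyword equivalent (the version values are examples; the keyword constructor is assumed from GPUCompiler.jl):

    using GPUCompiler

    # Same target spelled out: sm_70 device code, PTX ISA 6.3.
    target = PTXCompilerTarget(; cap=v"7.0", ptx=v"6.3",
                               exitable=true, debuginfo=false)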

src/device/intrinsics/indexing.jl

Lines changed: 18 additions & 2 deletions

@@ -1,8 +1,7 @@
 # Indexing and dimensions (B.4)
 
 export
-    threadIdx, blockDim, blockIdx, gridDim,
-    warpsize
+    threadIdx, blockDim, blockIdx, gridDim, laneid, warpsize, active_mask, FULL_MASK
 
 @generated function _index(::Val{name}, ::Val{range}) where {name, range}
     JuliaContext() do ctx
@@ -96,3 +95,20 @@ Returns the thread index within the block.
 Returns the warp size (in threads).
 """
 @inline warpsize() = Int(ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, UInt32, ()))
+
+"""
+    laneid()::Int
+
+Returns the thread's lane within the warp.
+"""
+@inline laneid() = Int(ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, UInt32, ()))+UInt32(1)
+
+"""
+    active_mask()
+
+Returns a 32-bit mask indicating which threads in a warp are active with the current
+executing thread.
+"""
+@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
+
+const FULL_MASK = 0xffffffff
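A hypothetical usage sketch for the new intrinsics (not part of this commit): one thread per warp prints the warp's ballot of active lanes. Assumes a CUDA-capable device and CUDA.jl's @cuda and @cuprintln:

    using CUDA

    function report_mask()
        mask = active_mask()      # 32-bit mask of currently active lanes
        if laneid() == 1          # laneid() is 1-based
            @cuprintln("active mask: ", mask)
        end
        return
    end

    @cuda threads=64 report_mask()   # two warps, so two lines of output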

src/device/intrinsics/warp_shuffle.jl

Lines changed: 1 addition & 5 deletions

@@ -1,16 +1,12 @@
 # Warp Shuffle (B.14)
 
-export FULL_MASK
+# TODO: this functionality should throw <sm_30
 
 # TODO: does not work on sub-word (ie. Int16) or non-word divisible sized types
 
 # TODO: these functions should dispatch based on the actual warp size
 const ws = Int32(32)
 
-const FULL_MASK = 0xffffffff
-
-# TODO: this functionality should throw <sm_30
-
 
 # core intrinsics
 
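FULL_MASK now lives alongside the other warp intrinsics in indexing.jl, but its typical consumer remains the shuffle API in this file. A hypothetical warp-level sum reduction using it (shfl_down_sync as provided by CUDA.jl; device-only code):

    # Reduce `val` across the 32 lanes of a warp; each step halves the
    # number of participating lanes.
    function warp_sum(val::Int32)
        offset = Int32(16)
        while offset > Int32(0)
            val += shfl_down_sync(FULL_MASK, val, offset)
            offset >>= 1
        end
        return val    # lane 1 ends up holding the warp-wide sum
    end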

test/device/intrinsics.jl

Lines changed: 4 additions & 0 deletions

@@ -14,6 +14,10 @@
 @on_device blockIdx().z
 @on_device gridDim().z
 
+@on_device warpsize()
+@on_device laneid()
+@on_device active_mask()
+
 @testset "range metadata" begin
     foobar() = threadIdx().x
     ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}; raw=true))
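@on_device is the test suite's smoke-test helper: it wraps an expression in a throwaway kernel and launches it, so these additions only verify that the new intrinsics compile and run on the device. Roughly equivalent by hand (a sketch; the real macro in the test utilities differs in detail):

    function check_active_mask()
        active_mask()   # result discarded; we only care that it executes
        return
    end

    CUDA.@cuda check_active_mask()
    CUDA.synchronize()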
