Commit 4721f60

Merge pull request #801 from JuliaGPU/tb/improvements
Various improvements
2 parents: 87b1f4a + 44a7951

File tree: 6 files changed (+29 −13 lines)

Manifest.toml

Lines changed: 3 additions & 1 deletion

@@ -83,7 +83,9 @@ version = "6.2.0"
 
 [[GPUCompiler]]
 deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "0bae2d5a40c54c6c680a1a223f83a3c415ec730b"
+git-tree-sha1 = "386f3a455ebd4ad9a0b752715977ba70a59462ab"
+repo-rev = "36ddc724add5835fd901d4bf1eca929865cf6dd7"
+repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl.git"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
 version = "0.11.0"
 
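For context, a repo-url/repo-rev pair like the one above is what Julia's package manager records when a dependency is tracked at a specific git revision rather than a registered release. A minimal sketch of reproducing such a pin (assuming Julia ≥ 1.5; the revision hash is the one from the diff):

    using Pkg

    # Track GPUCompiler.jl at the exact commit pinned in this Manifest.
    Pkg.add(url="https://github.com/JuliaGPU/GPUCompiler.jl.git",
            rev="36ddc724add5835fd901d4bf1eca929865cf6dd7")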

deps/compatibility.jl

Lines changed: 1 addition & 4 deletions

@@ -178,10 +178,7 @@ function llvm_compat(version=LLVM.version())
     InitializeNVPTXTarget()
 
     cap_support = sort(collect(llvm_cap_support(version)))
-
-    ptx_support = llvm_ptx_support(version)
-    push!(ptx_support, v"6.0") # JuliaLang/julia#23817
-    ptx_support = sort(collect(ptx_support))
+    ptx_support = sort(collect(llvm_ptx_support(version)))
 
     return (cap=cap_support, ptx=ptx_support)
 end
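The removed push! was a workaround for JuliaLang/julia#23817, where the bundled LLVM did not advertise PTX ISA 6.0 support; with GPUCompiler pinned as above, llvm_ptx_support is assumed to report the supported ISA versions directly. A hypothetical check of the simplified result (names from the diff; the exact versions depend on the local LLVM):

    compat = llvm_compat()                  # NamedTuple of version lists
    @assert compat.ptx == sort(compat.ptx)  # both fields come back sorted
    @show compat.cap compat.ptx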

src/compiler/gpucompiler.jl

Lines changed: 2 additions & 1 deletion

@@ -1,5 +1,6 @@
 function CUDACompilerTarget(dev::CuDevice; kwargs...)
     cap = supported_capability(dev)
+    ptx = v"6.3" # we only need 6.2, but NVPTX doesn't support that
 
     exitable = true
     if cap < v"7"
@@ -16,7 +17,7 @@ function CUDACompilerTarget(dev::CuDevice; kwargs...)
 
     debuginfo = false
 
-    PTXCompilerTarget(; cap, exitable, debuginfo, kwargs...)
+    PTXCompilerTarget(; cap, ptx, exitable, debuginfo, kwargs...)
 end
 
 struct CUDACompilerParams <: AbstractCompilerParams end
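The call site uses Julia's implicit keyword syntax: PTXCompilerTarget(; cap, ptx, ...) is shorthand for PTXCompilerTarget(; cap=cap, ptx=ptx, ...). For illustration, a hypothetical explicit-keyword equivalent (the version values are examples; the keyword constructor is assumed from GPUCompiler.jl):

    using GPUCompiler

    # Same target spelled out: sm_70 device code, PTX ISA 6.3.
    target = PTXCompilerTarget(; cap=v"7.0", ptx=v"6.3",
                               exitable=true, debuginfo=false)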

src/device/intrinsics/indexing.jl

Lines changed: 18 additions & 2 deletions

@@ -1,8 +1,7 @@
 # Indexing and dimensions (B.4)
 
 export
-    threadIdx, blockDim, blockIdx, gridDim,
-    warpsize
+    threadIdx, blockDim, blockIdx, gridDim, laneid, warpsize, active_mask, FULL_MASK
 
 @generated function _index(::Val{name}, ::Val{range}) where {name, range}
     JuliaContext() do ctx
@@ -96,3 +95,20 @@ Returns the thread index within the block.
 Returns the warp size (in threads).
 """
 @inline warpsize() = Int(ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, UInt32, ()))
+
+"""
+    laneid()::Int
+
+Returns the thread's lane within the warp.
+"""
+@inline laneid() = Int(ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, UInt32, ()))+UInt32(1)
+
+"""
+    active_mask()
+
+Returns a 32-bit mask indicating which threads in a warp are active with the current
+executing thread.
+"""
+@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
+
+const FULL_MASK = 0xffffffff
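A hypothetical usage sketch for the new intrinsics (not part of this commit): one thread per warp prints the warp's ballot of active lanes. Assumes a CUDA-capable device and CUDA.jl's @cuda and @cuprintln:

    using CUDA

    function report_mask()
        mask = active_mask()      # 32-bit mask of currently active lanes
        if laneid() == 1          # laneid() is 1-based
            @cuprintln("active mask: ", mask)
        end
        return
    end

    @cuda threads=64 report_mask()   # two warps, so two lines of output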

src/device/intrinsics/warp_shuffle.jl

Lines changed: 1 addition & 5 deletions

@@ -1,16 +1,12 @@
 # Warp Shuffle (B.14)
 
-export FULL_MASK
+# TODO: this functionality should throw <sm_30
 
 # TODO: does not work on sub-word (ie. Int16) or non-word divisible sized types
 
 # TODO: these functions should dispatch based on the actual warp size
 const ws = Int32(32)
 
-const FULL_MASK = 0xffffffff
-
-# TODO: this functionality should throw <sm_30
-
 
 # core intrinsics
 
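FULL_MASK now lives alongside the other warp intrinsics in indexing.jl, but its typical consumer remains the shuffle API in this file. A hypothetical warp-level sum reduction using it (shfl_down_sync as provided by CUDA.jl; device-only code):

    # Reduce `val` across the 32 lanes of a warp; each step halves the
    # number of participating lanes.
    function warp_sum(val::Int32)
        offset = Int32(16)
        while offset > Int32(0)
            val += shfl_down_sync(FULL_MASK, val, offset)
            offset >>= 1
        end
        return val    # lane 1 ends up holding the warp-wide sum
    end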

test/device/intrinsics.jl

Lines changed: 4 additions & 0 deletions

@@ -14,6 +14,10 @@
 @on_device blockIdx().z
 @on_device gridDim().z
 
+@on_device warpsize()
+@on_device laneid()
+@on_device active_mask()
+
 @testset "range metadata" begin
     foobar() = threadIdx().x
     ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}; raw=true))
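@on_device is the test suite's smoke-test helper: it wraps an expression in a throwaway kernel and launches it, so these additions only verify that the new intrinsics compile and run on the device. Roughly equivalent by hand (a sketch; the real macro in the test utilities differs in detail):

    function check_active_mask()
        active_mask()   # result discarded; we only care that it executes
        return
    end

    CUDA.@cuda check_active_mask()
    CUDA.synchronize()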
