Skip to content

Commit 530f1c6

Browse files
authored
CUPTI: Add high-level wrappers for the callback API. (#2239)
1 parent 88ebe50 commit 530f1c6

File tree

4 files changed

+141
-91
lines changed

4 files changed

+141
-91
lines changed

lib/cupti/error.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ function description(err)
8888
elseif err.code == CUPTI_ERROR_CUDA_COMPILER_NOT_COMPATIBLE
8989
"Profiling results might be incorrect for CUDA applications compiled with nvcc version older than 9.0 for devices with compute capability 6.0 and 6.1"
9090
elseif err.code == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
91-
"user doesn't have sufficient privileges which are required to start the profiling session"
91+
"""Insufficient privileges: You don't have permissions to profile GPU code.
92+
Please configure your system to allow all users to profile, or run Julia with
93+
elevated permissions: https://developer.nvidia.com/ERR_NVGPUCTRPERM#SolnAdminTag"""
9294
elseif err.code == CUPTI_ERROR_OLD_PROFILER_API_INITIALIZED
9395
"old profiling api's are not supported with new profiling api's"
9496
elseif err.code == CUPTI_ERROR_OPENACC_UNDEFINED_ROUTINE

lib/cupti/wrappers.jl

Lines changed: 114 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,98 @@ function version()
55
end
66

77

8+
#
9+
# callback API
10+
#
11+
12+
# CUPTI does not support multiple subscribers, so this lock ensures that only one callback session is active at a time
13+
const callback_lock = ReentrantLock()
14+
15+
# C-callable trampoline registered with CUPTI: recovers the `CallbackConfig` from
# `userdata`, loads the domain-specific payload behind `data_ptr`, and forwards
# everything to the user-provided `cfg.callback(domain, id, data)`.
function callback(userdata::Ptr{Cvoid}, domain::CUpti_CallbackDomain,
                  id::CUpti_CallbackId, data_ptr::Ptr{Cvoid})
    config = Base.unsafe_pointer_to_objref(userdata)::CallbackConfig

    # select the CUPTI structure describing the payload of this domain
    payload_type =
        if domain == CUPTI_CB_DOMAIN_DRIVER_API || domain == CUPTI_CB_DOMAIN_RUNTIME_API
            CUpti_CallbackData
        elseif domain == CUPTI_CB_DOMAIN_RESOURCE
            CUpti_ResourceData
        elseif domain == CUPTI_CB_DOMAIN_SYNCHRONIZE
            CUpti_SynchronizeData
        elseif domain == CUPTI_CB_DOMAIN_NVTX
            CUpti_NvtxData
        else
            nothing
        end

    # unknown domain: we cannot interpret the payload, so warn and bail out
    if payload_type === nothing
        @warn """Unsupported callback domain: $(domain).
                 Please file an issue, or extend the implementation of `CUPTI.callback` to handle this callback kind."""
        return
    end

    # hand a loaded copy of the payload to the user callback
    payload = unsafe_load(convert(Ptr{payload_type}, data_ptr))
    config.callback(domain, id, payload)

    return
end
40+
41+
"""
    CallbackConfig(callback, callback_kinds)

Configuration of a CUPTI callback session. While the session is active (see
`enable!`), `callback` is invoked as `callback(domain, id, data)` for callbacks
delivered in any of the domains listed in `callback_kinds`.

# Examples

    cfg = CUPTI.CallbackConfig(callback_kinds) do domain, id, data
        # inspect data
    end

    CUPTI.enable!(cfg) do
        # do stuff
    end
"""
mutable struct CallbackConfig
    # user function, called as `callback(domain, id, data)`
    callback::Function
    # callback domains that are enabled for the duration of the session
    callback_kinds::Vector{CUpti_CallbackDomain}
end
54+
55+
"""
    enable!(f, cfg::CallbackConfig)

Run `f()` with CUPTI callbacks enabled for every domain in `cfg.callback_kinds`,
routing them to `cfg.callback`. Returns the value of `f()`.

The subscriber is always unsubscribed again, even when enabling a domain, `f`
itself, or disabling a domain throws. Sessions are serialized through
`callback_lock`.
"""
function enable!(f::Base.Callable, cfg::CallbackConfig)
    @lock callback_lock begin
        callback_ptr =
            @cfunction(callback, Cvoid,
                       (Ptr{Cvoid}, CUpti_CallbackDomain, CUpti_CallbackId, Ptr{Cvoid}))

        # CUPTI is only given the raw address of `cfg`, so keep the object rooted for
        # as long as the subscription exists
        GC.@preserve cfg begin
            # set-up subscriber
            subscriber_ref = Ref{CUpti_SubscriberHandle}()
            cuptiSubscribe(subscriber_ref, callback_ptr, Base.pointer_from_objref(cfg))
            subscriber = subscriber_ref[]

            try
                # enable domains; done inside the `try` so that a failure here does
                # not leave the subscriber registered
                for callback_kind in cfg.callback_kinds
                    cuptiEnableDomain(true, subscriber, callback_kind)
                end

                f()
            finally
                try
                    # disable callback kinds
                    for callback_kind in cfg.callback_kinds
                        cuptiEnableDomain(false, subscriber, callback_kind)
                    end
                finally
                    # always release the subscriber, even if disabling a domain failed
                    cuptiUnsubscribe(subscriber)
                end
            end
        end
    end
end
86+
87+
888
#
989
# activity API
1090
#
1191

1292
"""
13-
cfg = ActvitiyConfig(activity_kinds)
93+
cfg = CUPTI.ActivityConfig(activity_kinds)
1494
15-
enable!(cfg)
16-
# do stuff
17-
disable!(cfg)
95+
CUPTI.enable!(cfg) do
96+
# do stuff
97+
end
1898
19-
process(cfg) do (ctx, stream, record)
99+
CUPTI.process(cfg) do ctx, stream_id, record
20100
# inspect record
21101
end
22102
@@ -43,6 +123,7 @@ function allocate_buffer()
43123
Array{UInt8}(undef, 8 * 1024 * 1024) # 8 MB
44124
end
45125

126+
const activity_lock = ReentrantLock()
46127
const activity_config = Ref{Union{Nothing,ActivityConfig}}(nothing)
47128

48129
function request_buffer(dest_ptr, sz_ptr, max_num_records_ptr)
@@ -93,41 +174,40 @@ function complete_buffer(ctx_handle, stream_id, buf_ptr, sz, valid_sz)
93174
return
94175
end
95176

96-
function enable!(cfg::ActivityConfig)
97-
activity_config[] === nothing ||
98-
error("Only one profiling session can be active at a time.")
99-
100-
# set-up callbacks
101-
request_buffer_ptr = @cfunction(request_buffer, Cvoid,
102-
(Ptr{Ptr{UInt8}}, Ptr{Csize_t}, Ptr{Csize_t}))
103-
complete_buffer_ptr = @cfunction(complete_buffer, Cvoid,
104-
(CUDA.CUcontext, UInt32, Ptr{UInt8}, Csize_t, Csize_t))
105-
cuptiActivityRegisterCallbacks(request_buffer_ptr, complete_buffer_ptr)
177+
"""
    enable!(f, cfg::ActivityConfig)

Run `f()` while the activity kinds in `cfg.activity_kinds` are being recorded, and
return the value of `f()`.

Afterwards the activity kinds that were enabled are disabled again, all activity
records are flushed, and `activity_config[]` is reset to `nothing`; the reset happens
even if set-up, `f`, or the teardown throws. Sessions are serialized through
`activity_lock`.
"""
function enable!(f::Base.Callable, cfg::ActivityConfig)
    @lock activity_lock begin
        # publish the active session before any CUPTI call, and guarantee it is
        # cleared again no matter how this block is left
        activity_config[] = cfg
        # kinds that were actually enabled, so a partial failure rolls back exactly those
        # (assumes `cfg.activity_kinds` is an iterable with a concrete `eltype`)
        enabled_kinds = eltype(cfg.activity_kinds)[]
        try
            # set-up callbacks
            request_buffer_ptr =
                @cfunction(request_buffer, Cvoid,
                           (Ptr{Ptr{UInt8}}, Ptr{Csize_t}, Ptr{Csize_t}))
            complete_buffer_ptr =
                @cfunction(complete_buffer, Cvoid,
                           (CUDA.CUcontext, UInt32, Ptr{UInt8}, Csize_t, Csize_t))
            cuptiActivityRegisterCallbacks(request_buffer_ptr, complete_buffer_ptr)

            try
                # enable requested activity kinds
                for activity_kind in cfg.activity_kinds
                    cuptiActivityEnable(activity_kind)
                    push!(enabled_kinds, activity_kind)
                end

                f()
            finally
                # disable activity kinds
                for activity_kind in enabled_kinds
                    cuptiActivityDisable(activity_kind)
                end

                # flush all activity records, even incomplete ones; this has to happen
                # before `activity_config[]` is cleared
                cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)
            end
        finally
            activity_config[] = nothing
        end
    end
end
132212

133213
function process(f, cfg::ActivityConfig)

src/compiler/reflection.jl

Lines changed: 20 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,14 @@
11
# code reflection entry-points
22

33
using .CUPTI
4-
using .CUPTI: CUpti_CallbackDomain, CUpti_CallbackId, CUpti_SubscriberHandle,
5-
CUpti_ResourceData, CUpti_ModuleResourceData
4+
using .CUPTI: CUpti_ModuleResourceData
65

76

87

98
#
109
# code_* replacements
1110
#
1211

13-
function code_sass_callback(userdata::Ptr{Cvoid}, domain::CUpti_CallbackDomain,
14-
cbid::CUpti_CallbackId, cbdada::Ptr{Cvoid})
15-
dest = Base.unsafe_pointer_to_objref(userdata)::Ref{Any}
16-
17-
if domain == CUPTI.CUPTI_CB_DOMAIN_RESOURCE
18-
cbdada = unsafe_load(reinterpret(Ptr{CUpti_ResourceData}, cbdada))
19-
if cbid == CUPTI.CUPTI_CBID_RESOURCE_MODULE_LOADED
20-
resourceDescriptor =
21-
unsafe_load(reinterpret(Ptr{CUpti_ModuleResourceData}, cbdada.resourceDescriptor))
22-
cubin = unsafe_wrap(Vector{Cchar}, pointer(resourceDescriptor.pCubin),
23-
resourceDescriptor.cubinSize)
24-
dest[] = copy(cubin)
25-
end
26-
end
27-
28-
return
29-
end
30-
3112
"""
3213
code_sass([io], f, types; raw=false)
3314
@@ -49,8 +30,8 @@ function code_sass(io::IO, @nospecialize(func), @nospecialize(types); kwargs...)
4930
code_sass(io, job; kwargs...)
5031
end
5132

52-
# multiple subscribers aren't supported, so make sure we only call CUPTI once
53-
const cupti_lock = ReentrantLock()
33+
# Convenience method without an `io` argument: forwards to the `io`-taking method
# with `stdout`.
function code_sass(@nospecialize(func), @nospecialize(types); kwargs...)
    return code_sass(stdout, func, types; kwargs...)
end
5435

5536
function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
5637
if !job.config.kernel
@@ -64,35 +45,27 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
6445
return
6546
end
6647

67-
compiled = compile(job)
48+
cfg = CUPTI.CallbackConfig([CUPTI.CUPTI_CB_DOMAIN_RESOURCE]) do domain, id, data
49+
id == CUPTI.CUPTI_CBID_RESOURCE_MODULE_LOADED || return
50+
resourceDescriptor =
51+
unsafe_load(convert(Ptr{CUpti_ModuleResourceData}, data.resourceDescriptor))
52+
cubin = unsafe_wrap(Vector{Cchar}, pointer(resourceDescriptor.pCubin),
53+
resourceDescriptor.cubinSize)
54+
disassemble_cubin(io, cubin; raw)
55+
end
6856

69-
cubin = Ref{Any}()
70-
callback = @cfunction(code_sass_callback, Cvoid,
71-
(Ptr{Cvoid}, CUpti_CallbackDomain, CUpti_CallbackId, Ptr{Cvoid}))
72-
73-
# JIT compile and capture the generated object file
74-
lock(cupti_lock) do
75-
subscriber_ref = Ref{CUpti_SubscriberHandle}()
76-
res = CUPTI.unsafe_cuptiSubscribe(subscriber_ref, callback, Base.pointer_from_objref(cubin))
77-
if res === CUPTI.CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
78-
error("""Insufficient privilege: You don't have permissions to profile GPU code, which is required for `code_sass`.
79-
Get administrative privileges or allow all users to profile: https://developer.nvidia.com/ERR_NVGPUCTRPERM#SolnAdminTag""")
80-
elseif res != CUPTI.CUPTI_SUCCESS
81-
throw(CUPTIError(res))
82-
end
83-
subscriber = subscriber_ref[]
84-
try
85-
CUPTI.cuptiEnableDomain(1, subscriber, CUPTI.CUPTI_CB_DOMAIN_RESOURCE)
86-
link(job, compiled)
87-
finally
88-
CUPTI.cuptiUnsubscribe(subscriber)
89-
end
57+
compiled = compile(job)
58+
CUPTI.enable!(cfg) do
59+
link(job, compiled)
9060
end
9161

92-
# disassemble to SASS
93-
isassigned(cubin) || error("No kernels compiled")
62+
return
63+
end
64+
65+
# disassemble a cubin to SASS
66+
function disassemble_cubin(io::IO, cubin::Vector{Cchar}; raw::Bool)
9467
mktemp() do cubin_path,cubin_io
95-
write(cubin_io, cubin[])
68+
write(cubin_io, cubin)
9669
flush(cubin_io)
9770

9871
cmd = `$(nvdisasm()) --print-code --print-line-info $cubin_path`
@@ -111,9 +84,6 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
11184
end
11285
end
11386

114-
code_sass(@nospecialize(func), @nospecialize(types); kwargs...) =
115-
code_sass(stdout, func, types; kwargs...)
116-
11787

11888
# forward the rest to GPUCompiler with an appropriate CompilerJob
11989

src/profile.jl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -270,18 +270,16 @@ function profile_internally(f; concurrent=true, kwargs...)
270270
GC.gc(false)
271271
GC.gc(true)
272272

273-
CUPTI.enable!(cfg)
273+
CUPTI.enable!(cfg) do
274+
# sink the initial profiler overhead into a synchronization call
275+
CUDA.cuCtxSynchronize()
274276

275-
# sink the initial profiler overhead into a synchronization call
276-
CUDA.cuCtxSynchronize()
277-
try
278277
f()
279278

280279
# synchronize to ensure we capture all activity
281280
CUDA.cuCtxSynchronize()
282-
finally
283-
CUPTI.disable!(cfg)
284281
end
282+
285283
data = capture(cfg)
286284
ProfileResults(; data..., kwargs...)
287285
end

0 commit comments

Comments
 (0)