Skip to content

Commit 5af1a7a

Browse files
authored
Add support for NVTX events to the integrated profiler. (#2043)
1 parent 6ad7a8c commit 5af1a7a

File tree

8 files changed

+200
-32
lines changed

8 files changed

+200
-32
lines changed

Manifest.toml

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
julia_version = "1.8.5"
44
manifest_format = "2.0"
5-
project_hash = "56ddf5ba3ee0cbfd2fc5e239e07860c2adb9b489"
5+
project_hash = "62b8680a215cbfae7c33bbe0a528c176464c0f65"
66

77
[[deps.AbstractFFTs]]
88
deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
@@ -73,6 +73,18 @@ git-tree-sha1 = "2fba81a302a7be671aefe194f0525ef231104e7f"
7373
uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
7474
version = "0.1.8"
7575

76+
[[deps.ColorTypes]]
77+
deps = ["FixedPointNumbers", "Random"]
78+
git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4"
79+
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
80+
version = "0.11.4"
81+
82+
[[deps.Colors]]
83+
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
84+
git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a"
85+
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
86+
version = "0.12.10"
87+
7688
[[deps.Compat]]
7789
deps = ["Dates", "LinearAlgebra", "UUIDs"]
7890
git-tree-sha1 = "e460f044ca8b99be31d35fe54fc33a5c33dd8ed7"
@@ -134,6 +146,12 @@ version = "0.1.10"
134146
[[deps.FileWatching]]
135147
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
136148

149+
[[deps.FixedPointNumbers]]
150+
deps = ["Statistics"]
151+
git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
152+
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
153+
version = "0.8.4"
154+
137155
[[deps.Future]]
138156
deps = ["Random"]
139157
uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
@@ -193,6 +211,12 @@ git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
193211
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
194212
version = "1.5.0"
195213

214+
[[deps.JuliaNVTXCallbacks_jll]]
215+
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
216+
git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
217+
uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
218+
version = "0.2.1+0"
219+
196220
[[deps.KernelAbstractions]]
197221
deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
198222
git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
@@ -280,6 +304,18 @@ version = "1.1.0"
280304
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
281305
version = "2022.2.1"
282306

307+
[[deps.NVTX]]
308+
deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
309+
git-tree-sha1 = "ab760fa11c7f12df87334ea9e0dffc54af3025bc"
310+
uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
311+
version = "0.3.2"
312+
313+
[[deps.NVTX_jll]]
314+
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
315+
git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
316+
uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
317+
version = "3.1.0+2"
318+
283319
[[deps.NetworkOptions]]
284320
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
285321
version = "1.2.0"

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
2121
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
2222
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
2323
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
24+
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
2425
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
2526
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
2627
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -49,6 +50,7 @@ GPUArrays = "8.6"
4950
GPUCompiler = "0.22"
5051
KernelAbstractions = "0.9.2"
5152
LLVM = "6"
53+
NVTX = "0.3.2"
5254
Preferences = "1"
5355
PrettyTables = "2"
5456
Random123 = "1.2"

lib/cupti/wrappers.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ function process(f, cfg::ActivityConfig)
135135
CUPTI_ACTIVITY_KIND_DRIVER => CUpti_ActivityAPI,
136136
CUPTI_ACTIVITY_KIND_RUNTIME => CUpti_ActivityAPI,
137137
CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API => CUpti_ActivityAPI,
138+
CUPTI_ACTIVITY_KIND_NAME => CUpti_ActivityName,
139+
CUPTI_ACTIVITY_KIND_MARKER => CUpti_ActivityMarker2,
140+
CUPTI_ACTIVITY_KIND_MARKER_DATA => CUpti_ActivityMarkerData,
138141
)
139142
# NOTE: the CUPTI version is unreliable, e.g., both CUDA 11.5 and 11.6 have CUPTI 16,
140143
# yet CUpti_ActivityMemset4 is only available in CUDA 11.6.

src/CUDA.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ import Preferences
3434

3535
using Libdl
3636

37+
import NVTX
38+
3739

3840
## source code includes
3941

src/initialization.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,12 @@ function __init__()
119119
It is recommended to upgrade your driver, or switch to automatic installation of CUDA."""
120120
end
121121

122+
# if we're not running under an external profiler, let CUPTI handle NVTX events
123+
if !NVTX.isactive()
124+
ENV["NVTX_INJECTION64_PATH"] = CUDA_Runtime.libcupti
125+
NVTX.activate()
126+
end
127+
122128
# finally, initialize CUDA
123129
try
124130
cuInit(0)

src/profile.jl

Lines changed: 123 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Profiler control
22

33
"""
4-
@profile [io=stdout] [host=true] [device=true] [trace=false] [raw=false] code...
4+
@profile [io=stdout] [trace=false] [raw=false] code...
55
@profile external=true code...
66
77
Profile the GPU execution of `code`.
@@ -11,22 +11,19 @@ There are two modes of operation, depending on whether `external` is `true` or `
1111
## Integrated profiler (`external=false`, the default)
1212
1313
In this mode, CUDA.jl will profile the execution of `code` and display the result. By
14-
default, both host-side and device-side activity is captured; this can be controlled with
15-
the `host` and `device` keyword arguments. If `trace` is `true`, a chronological trace of
16-
the captured activity will be generated, where the ID column can be used to match host-side
17-
and device-side activity; by default, only a summary will be shown. If `raw` is `true`, all
18-
data will always be included, even if it may not be relevant. The output will be written to
19-
`io`, which defaults to `stdout`.
14+
default, a summary of host and device-side execution will be show, including any NVTX
15+
events. To display a chronological trace of the captured activity instead, `trace` can be
16+
set to `true`. Trace output will include an ID column that can be used to match host-side
17+
and device-side activity. If `raw` is `true`, all data will always be included, even if it
18+
may not be relevant. The output will be written to `io`, which defaults to `stdout`.
2019
2120
Slow operations will be highlighted in the output: Entries colored in yellow are among the
2221
slowest 25%, while entries colored in red are among the slowest 5% of all operations.
2322
24-
!!! compat "Julia 1.9"
25-
This functionality is only available on Julia 1.9 and later.
23+
!!! compat "Julia 1.9" This functionality is only available on Julia 1.9 and later.
2624
27-
!!! compat "CUDA 11.2"
28-
Older versions of CUDA, before 11.2, contain bugs that may prevent the
29-
`CUDA.@profile` macro to work. It is recommended to use a newer runtime.
25+
!!! compat "CUDA 11.2" Older versions of CUDA, before 11.2, contain bugs that may prevent
26+
the `CUDA.@profile` macro to work. It is recommended to use a newer runtime.
3027
3128
## External profilers (`external=true`)
3229
@@ -187,6 +184,8 @@ function emit_integrated_profile(code, kwargs)
187184
# memory operations
188185
CUPTI.CUPTI_ACTIVITY_KIND_MEMCPY,
189186
CUPTI.CUPTI_ACTIVITY_KIND_MEMSET,
187+
# NVTX markers
188+
CUPTI.CUPTI_ACTIVITY_KIND_MARKER,
190189
]
191190
if CUDA.runtime_version() >= v"11.2"
192191
# additional information for API host calls
@@ -260,6 +259,14 @@ function generate_traces(cfg)
260259
id = Int[],
261260
details = String[],
262261
)
262+
nvtx_trace = DataFrame(
263+
id = Int[],
264+
start = Float64[],
265+
type = Symbol[],
266+
tid = Int[],
267+
name = Union{Missing,String}[],
268+
domain = Union{Missing,String}[],
269+
)
263270

264271
# memory_kind fields are sometimes typed CUpti_ActivityMemoryKind, sometimes UInt
265272
as_memory_kind(x) = isa(x, CUPTI.CUpti_ActivityMemoryKind) ? x : CUPTI.CUpti_ActivityMemoryKind(x)
@@ -349,18 +356,44 @@ function generate_traces(cfg)
349356
stream=record.streamId,
350357
grid, block, registers,
351358
static_shmem, dynamic_shmem); cols=:union)
359+
360+
# NVTX markers
361+
elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MARKER
362+
start = record.timestamp/1e9
363+
name = record.name == C_NULL ? missing : unsafe_string(record.name)
364+
domain = record.domain == C_NULL ? missing : unsafe_string(record.domain)
365+
366+
if record.flags == CUPTI.CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS
367+
@assert record.objectKind == CUDA.CUPTI.CUPTI_ACTIVITY_OBJECT_THREAD
368+
tid = record.objectId.pt.threadId
369+
push!(nvtx_trace, (; record.id, start, tid, type=:instant, name, domain))
370+
elseif record.flags == CUPTI.CUPTI_ACTIVITY_FLAG_MARKER_START
371+
@assert record.objectKind == CUDA.CUPTI.CUPTI_ACTIVITY_OBJECT_THREAD
372+
tid = record.objectId.pt.threadId
373+
push!(nvtx_trace, (; record.id, start, tid, type=:start, name, domain))
374+
elseif record.flags == CUPTI.CUPTI_ACTIVITY_FLAG_MARKER_END
375+
@assert record.objectKind == CUDA.CUPTI.CUPTI_ACTIVITY_OBJECT_THREAD
376+
tid = record.objectId.pt.threadId
377+
push!(nvtx_trace, (; record.id, start, tid, type=:end, name, domain))
378+
else
379+
@error "Unexpected NVTX marker kind $(Int(record.flags)). Please file an issue."
380+
end
352381
else
353-
error("Unexpected CUPTI activity kind: $(record.kind). Please file an issue.")
382+
@error "Unexpected CUPTI activity kind $(Int(record.kind)). Please file an issue."
354383
end
355384
end
356385

357-
return host_trace, device_trace, details
386+
# merge in the details
387+
host_trace = leftjoin(host_trace, details, on=:id, order=:left)
388+
device_trace = leftjoin(device_trace, details, on=:id, order=:left)
389+
390+
return host_trace, device_trace, nvtx_trace
358391
end
359392

360393
# render traces to a table
361-
function render_traces(host_trace, device_trace, details;
394+
function render_traces(host_trace, device_trace, nvtx_trace;
362395
io=stdout isa Base.TTY ? IOContext(stdout, :limit => true) : stdout,
363-
host=true, device=true, trace=false, raw=false)
396+
trace=false, raw=false)
364397
# find the relevant part of the trace (marked by calls to 'cuCtxSynchronize')
365398
trace_first_sync = findfirst(host_trace.name .== "cuCtxSynchronize")
366399
trace_first_sync === nothing && error("Could not find the start of the profiling trace.")
@@ -400,16 +433,21 @@ function render_traces(host_trace, device_trace, details;
400433
df.start .-= trace_begin
401434
df.stop .-= trace_begin
402435
end
436+
nvtx_trace.start .-= trace_begin
403437
if !raw
404438
# renumber event IDs from 1
405-
first_id = minimum([host_trace.id; device_trace.id; details.id])
406-
for df in (host_trace, device_trace, details)
439+
first_id = minimum([host_trace.id; device_trace.id])
440+
for df in (host_trace, device_trace)
407441
df.id .-= first_id - 1
408442
end
409443

410444
# renumber thread IDs from 1
411-
first_tid = minimum(host_trace.tid)
412-
host_trace.tid .-= first_tid - 1
445+
threads = unique([host_trace.tid; nvtx_trace.tid])
446+
for df in (host_trace, nvtx_trace)
447+
broadcast!(df.tid, df.tid) do tid
448+
findfirst(isequal(tid), threads)
449+
end
450+
end
413451

414452
end
415453

@@ -480,7 +518,7 @@ function render_traces(host_trace, device_trace, details;
480518
"name" => "Name"
481519
)
482520

483-
summary_formatter = function(v, i, j)
521+
summary_formatter(df) = function(v, i, j)
484522
if names(df)[j] == "time_ratio"
485523
format_percentage(v)
486524
elseif names(df)[j] in ["time", "time_avg", "time_min", "time_max"]
@@ -497,11 +535,11 @@ function render_traces(host_trace, device_trace, details;
497535
:horizontal
498536
end
499537

500-
if host
538+
# host-side activity
539+
let
501540
# to determine the time the host was active, we should look at threads separately
502541
host_time = maximum(combine(groupby(host_trace, :tid), :time => sum => :time).time)
503542
host_ratio = host_time / trace_time
504-
println(io, "\nHost-side activity: calling CUDA APIs took $(format_time(host_time)) ($(format_percentage(host_ratio)) of the trace)")
505543

506544
# get rid of API call version suffixes
507545
host_trace.name = replace.(host_trace.name, r"_v\d+$" => "")
@@ -524,11 +562,28 @@ function render_traces(host_trace, device_trace, details;
524562
end
525563
end
526564

527-
# add in details
528-
df = leftjoin(df, details, on=:id, order=:left)
565+
# instantaneous NVTX markers can be added to the API trace
566+
if trace
567+
markers = copy(nvtx_trace[nvtx_trace.type .== :instant, :])
568+
markers.id .= missing
569+
markers.time .= 0.0
570+
markers.details = map(markers.name, markers.domain) do name, domain
571+
if name !== missing && domain !== missing
572+
"$(domain).$(name)"
573+
elseif name !== missing
574+
"$name"
575+
end
576+
end
577+
markers.name .= "NVTX marker"
578+
append!(df, markers; cols=:subset)
579+
sort!(df, :start)
580+
end
529581

582+
if !isempty(df)
583+
println(io, "\nHost-side activity: calling CUDA APIs took $(format_time(host_time)) ($(format_percentage(host_ratio)) of the trace)")
584+
end
530585
if isempty(df)
531-
println(io, "No host-side activity was recorded.")
586+
println(io, "\nNo host-side activity was recorded.")
532587
elseif trace
533588
# determine columns to show, based on whether they contain useful information
534589
columns = [:id, :start, :time]
@@ -566,20 +621,23 @@ function render_traces(host_trace, device_trace, details;
566621
header = [summary_column_names[name] for name in names(df)]
567622
alignment = [i == lastindex(header) ? :l : :r for i in 1:length(header)]
568623
highlighters = time_highlighters(df)
569-
pretty_table(io, df; header, alignment, formatters=summary_formatter, highlighters, crop)
624+
pretty_table(io, df; header, alignment, formatters=summary_formatter(df), highlighters, crop)
570625
end
571626
end
572627

573-
if device
628+
# device-side activity
629+
let
574630
device_time = sum(device_trace.time)
575631
device_ratio = device_time / trace_time
576-
println(io, "\nDevice-side activity: GPU was busy for $(format_time(device_time)) ($(format_percentage(device_ratio)) of the trace)")
632+
if !isempty(device_trace)
633+
println(io, "\nDevice-side activity: GPU was busy for $(format_time(device_time)) ($(format_percentage(device_ratio)) of the trace)")
634+
end
577635

578636
# add memory throughput information
579637
device_trace.throughput = device_trace.size ./ device_trace.time
580638

581639
if isempty(device_trace)
582-
println(io, "No device-side activity was recorded.")
640+
println(io, "\nNo device-side activity was recorded.")
583641
elseif trace
584642
# determine columns to show, based on whether they contain useful information
585643
columns = [:id, :start, :time]
@@ -645,9 +703,43 @@ function render_traces(host_trace, device_trace, details;
645703
header = [summary_column_names[name] for name in names(df)]
646704
alignment = [i == lastindex(header) ? :l : :r for i in 1:length(header)]
647705
highlighters = time_highlighters(df)
648-
pretty_table(io, df; header, alignment, formatters=summary_formatter, highlighters, crop)
706+
pretty_table(io, df; header, alignment, formatters=summary_formatter(df), highlighters, crop)
649707
end
650708
end
709+
710+
# show NVTX ranges
711+
# TODO: do we also want to repeat the host/device summary for each NVTX range?
712+
# that's what nvprof used to do, but it's a little verbose...
713+
nvtx_ranges = copy(nvtx_trace[nvtx_trace.type .== :start, :])
714+
nvtx_ranges = leftjoin(nvtx_ranges, nvtx_trace[nvtx_trace.type .== :end,
715+
[:id, :start]],
716+
on=:id, makeunique=true)
717+
if !isempty(nvtx_ranges)
718+
println(io, "\nNVTX ranges:")
719+
720+
rename!(nvtx_ranges, :start_1 => :stop)
721+
nvtx_ranges.id .= missing
722+
nvtx_ranges.time .= nvtx_ranges.stop .- nvtx_ranges.start
723+
nvtx_ranges.name = map(nvtx_ranges.name, nvtx_ranges.domain) do name, domain
724+
if name !== missing && domain !== missing
725+
"$(domain).$(name)"
726+
elseif name !== missing
727+
"$name"
728+
end
729+
end
730+
731+
df = summarize_trace(nvtx_ranges)
732+
733+
columns = [:time_ratio, :time, :calls, :time_avg, :time_min, :time_max, :name]
734+
df = df[:, columns]
735+
736+
header = [summary_column_names[name] for name in names(df)]
737+
alignment = [i == lastindex(header) ? :l : :r for i in 1:length(header)]
738+
highlighters = time_highlighters(df)
739+
pretty_table(io, df; header, alignment, formatters=summary_formatter(df), highlighters, crop)
740+
end
741+
742+
return
651743
end
652744

653745
format_percentage(x::Number) = @sprintf("%.2f%%", x * 100)

test/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
1212
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
1313
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
1414
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
15+
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
1516
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
1617
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
1718
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

0 commit comments

Comments
 (0)