1
1
# Profiler control
2
2
3
3
"""
4
- @profile [io=stdout] [host=true] [device=true] [ trace=false] [raw=false] code...
4
+ @profile [io=stdout] [trace=false] [raw=false] code...
5
5
@profile external=true code...
6
6
7
7
Profile the GPU execution of `code`.
@@ -11,22 +11,19 @@ There are two modes of operation, depending on whether `external` is `true` or `
11
11
## Integrated profiler (`external=false`, the default)
12
12
13
13
In this mode, CUDA.jl will profile the execution of `code` and display the result. By
14
- default, both host-side and device-side activity is captured; this can be controlled with
15
- the `host` and `device` keyword arguments. If `trace` is `true`, a chronological trace of
16
- the captured activity will be generated, where the ID column can be used to match host-side
17
- and device-side activity; by default, only a summary will be shown. If `raw` is `true`, all
18
- data will always be included, even if it may not be relevant. The output will be written to
19
- `io`, which defaults to `stdout`.
14
+ default, a summary of host and device-side execution will be shown, including any NVTX
15
+ events. To display a chronological trace of the captured activity instead, `trace` can be
16
+ set to `true`. Trace output will include an ID column that can be used to match host-side
17
+ and device-side activity. If `raw` is `true`, all data will always be included, even if it
18
+ may not be relevant. The output will be written to `io`, which defaults to `stdout`.
20
19
21
20
Slow operations will be highlighted in the output: Entries colored in yellow are among the
22
21
slowest 25%, while entries colored in red are among the slowest 5% of all operations.
23
22
24
- !!! compat "Julia 1.9"
25
- This functionality is only available on Julia 1.9 and later.
23
+ !!! compat "Julia 1.9" This functionality is only available on Julia 1.9 and later.
26
24
27
- !!! compat "CUDA 11.2"
28
- Older versions of CUDA, before 11.2, contain bugs that may prevent the
29
- `CUDA.@profile` macro to work. It is recommended to use a newer runtime.
25
+ !!! compat "CUDA 11.2" Older versions of CUDA, before 11.2, contain bugs that may prevent
26
+ the `CUDA.@profile` macro from working. It is recommended to use a newer runtime.
30
27
31
28
## External profilers (`external=true`)
32
29
@@ -187,6 +184,8 @@ function emit_integrated_profile(code, kwargs)
187
184
# memory operations
188
185
CUPTI. CUPTI_ACTIVITY_KIND_MEMCPY,
189
186
CUPTI. CUPTI_ACTIVITY_KIND_MEMSET,
187
+ # NVTX markers
188
+ CUPTI. CUPTI_ACTIVITY_KIND_MARKER,
190
189
]
191
190
if CUDA. runtime_version () >= v " 11.2"
192
191
# additional information for API host calls
@@ -260,6 +259,14 @@ function generate_traces(cfg)
260
259
id = Int[],
261
260
details = String[],
262
261
)
262
+ nvtx_trace = DataFrame (
263
+ id = Int[],
264
+ start = Float64[],
265
+ type = Symbol[],
266
+ tid = Int[],
267
+ name = Union{Missing,String}[],
268
+ domain = Union{Missing,String}[],
269
+ )
263
270
264
271
# memory_kind fields are sometimes typed CUpti_ActivityMemoryKind, sometimes UInt
265
272
as_memory_kind (x) = isa (x, CUPTI. CUpti_ActivityMemoryKind) ? x : CUPTI. CUpti_ActivityMemoryKind (x)
@@ -349,18 +356,44 @@ function generate_traces(cfg)
349
356
stream= record. streamId,
350
357
grid, block, registers,
351
358
static_shmem, dynamic_shmem); cols= :union )
359
+
360
+ # NVTX markers
361
+ elseif record. kind == CUPTI. CUPTI_ACTIVITY_KIND_MARKER
362
+ start = record. timestamp/ 1e9
363
+ name = record. name == C_NULL ? missing : unsafe_string (record. name)
364
+ domain = record. domain == C_NULL ? missing : unsafe_string (record. domain)
365
+
366
+ if record. flags == CUPTI. CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS
367
+ @assert record. objectKind == CUDA. CUPTI. CUPTI_ACTIVITY_OBJECT_THREAD
368
+ tid = record. objectId. pt. threadId
369
+ push! (nvtx_trace, (; record. id, start, tid, type= :instant , name, domain))
370
+ elseif record. flags == CUPTI. CUPTI_ACTIVITY_FLAG_MARKER_START
371
+ @assert record. objectKind == CUDA. CUPTI. CUPTI_ACTIVITY_OBJECT_THREAD
372
+ tid = record. objectId. pt. threadId
373
+ push! (nvtx_trace, (; record. id, start, tid, type= :start , name, domain))
374
+ elseif record. flags == CUPTI. CUPTI_ACTIVITY_FLAG_MARKER_END
375
+ @assert record. objectKind == CUDA. CUPTI. CUPTI_ACTIVITY_OBJECT_THREAD
376
+ tid = record. objectId. pt. threadId
377
+ push! (nvtx_trace, (; record. id, start, tid, type= :end , name, domain))
378
+ else
379
+ @error " Unexpected NVTX marker kind $(Int (record. flags)) . Please file an issue."
380
+ end
352
381
else
353
- error ( " Unexpected CUPTI activity kind: $(record. kind) . Please file an issue." )
382
+ @ error " Unexpected CUPTI activity kind $(Int ( record. kind)) . Please file an issue."
354
383
end
355
384
end
356
385
357
- return host_trace, device_trace, details
386
+ # merge in the details
387
+ host_trace = leftjoin (host_trace, details, on= :id , order= :left )
388
+ device_trace = leftjoin (device_trace, details, on= :id , order= :left )
389
+
390
+ return host_trace, device_trace, nvtx_trace
358
391
end
359
392
360
393
# render traces to a table
361
- function render_traces (host_trace, device_trace, details ;
394
+ function render_traces (host_trace, device_trace, nvtx_trace ;
362
395
io= stdout isa Base. TTY ? IOContext (stdout , :limit => true ) : stdout ,
363
- host = true , device = true , trace= false , raw= false )
396
+ trace= false , raw= false )
364
397
# find the relevant part of the trace (marked by calls to 'cuCtxSynchronize')
365
398
trace_first_sync = findfirst (host_trace. name .== " cuCtxSynchronize" )
366
399
trace_first_sync === nothing && error (" Could not find the start of the profiling trace." )
@@ -400,16 +433,21 @@ function render_traces(host_trace, device_trace, details;
400
433
df. start .- = trace_begin
401
434
df. stop .- = trace_begin
402
435
end
436
+ nvtx_trace. start .- = trace_begin
403
437
if ! raw
404
438
# renumber event IDs from 1
405
- first_id = minimum ([host_trace. id; device_trace. id; details . id ])
406
- for df in (host_trace, device_trace, details )
439
+ first_id = minimum ([host_trace. id; device_trace. id])
440
+ for df in (host_trace, device_trace)
407
441
df. id .- = first_id - 1
408
442
end
409
443
410
444
# renumber thread IDs from 1
411
- first_tid = minimum (host_trace. tid)
412
- host_trace. tid .- = first_tid - 1
445
+ threads = unique ([host_trace. tid; nvtx_trace. tid])
446
+ for df in (host_trace, nvtx_trace)
447
+ broadcast! (df. tid, df. tid) do tid
448
+ findfirst (isequal (tid), threads)
449
+ end
450
+ end
413
451
414
452
end
415
453
@@ -480,7 +518,7 @@ function render_traces(host_trace, device_trace, details;
480
518
" name" => " Name"
481
519
)
482
520
483
- summary_formatter = function (v, i, j)
521
+ summary_formatter (df) = function (v, i, j)
484
522
if names (df)[j] == " time_ratio"
485
523
format_percentage (v)
486
524
elseif names (df)[j] in [" time" , " time_avg" , " time_min" , " time_max" ]
@@ -497,11 +535,11 @@ function render_traces(host_trace, device_trace, details;
497
535
:horizontal
498
536
end
499
537
500
- if host
538
+ # host-side activity
539
+ let
501
540
# to determine the time the host was active, we should look at threads separately
502
541
host_time = maximum (combine (groupby (host_trace, :tid ), :time => sum => :time ). time)
503
542
host_ratio = host_time / trace_time
504
- println (io, " \n Host-side activity: calling CUDA APIs took $(format_time (host_time)) ($(format_percentage (host_ratio)) of the trace)" )
505
543
506
544
# get rid of API call version suffixes
507
545
host_trace. name = replace .(host_trace. name, r" _v\d +$" => " " )
@@ -524,11 +562,28 @@ function render_traces(host_trace, device_trace, details;
524
562
end
525
563
end
526
564
527
- # add in details
528
- df = leftjoin (df, details, on= :id , order= :left )
565
+ # instantaneous NVTX markers can be added to the API trace
566
+ if trace
567
+ markers = copy (nvtx_trace[nvtx_trace. type .== :instant , :])
568
+ markers. id .= missing
569
+ markers. time .= 0.0
570
+ markers. details = map (markers. name, markers. domain) do name, domain
571
+ if name != = missing && domain != = missing
572
+ " $(domain) .$(name) "
573
+ elseif name != = missing
574
+ " $name "
575
+ end
576
+ end
577
+ markers. name .= " NVTX marker"
578
+ append! (df, markers; cols= :subset )
579
+ sort! (df, :start )
580
+ end
529
581
582
+ if ! isempty (df)
583
+ println (io, " \n Host-side activity: calling CUDA APIs took $(format_time (host_time)) ($(format_percentage (host_ratio)) of the trace)" )
584
+ end
530
585
if isempty (df)
531
- println (io, " No host-side activity was recorded." )
586
+ println (io, " \n No host-side activity was recorded." )
532
587
elseif trace
533
588
# determine columns to show, based on whether they contain useful information
534
589
columns = [:id , :start , :time ]
@@ -566,20 +621,23 @@ function render_traces(host_trace, device_trace, details;
566
621
header = [summary_column_names[name] for name in names (df)]
567
622
alignment = [i == lastindex (header) ? :l : :r for i in 1 : length (header)]
568
623
highlighters = time_highlighters (df)
569
- pretty_table (io, df; header, alignment, formatters= summary_formatter, highlighters, crop)
624
+ pretty_table (io, df; header, alignment, formatters= summary_formatter (df) , highlighters, crop)
570
625
end
571
626
end
572
627
573
- if device
628
+ # device-side activity
629
+ let
574
630
device_time = sum (device_trace. time)
575
631
device_ratio = device_time / trace_time
576
- println (io, " \n Device-side activity: GPU was busy for $(format_time (device_time)) ($(format_percentage (device_ratio)) of the trace)" )
632
+ if ! isempty (device_trace)
633
+ println (io, " \n Device-side activity: GPU was busy for $(format_time (device_time)) ($(format_percentage (device_ratio)) of the trace)" )
634
+ end
577
635
578
636
# add memory throughput information
579
637
device_trace. throughput = device_trace. size ./ device_trace. time
580
638
581
639
if isempty (device_trace)
582
- println (io, " No device-side activity was recorded." )
640
+ println (io, " \n No device-side activity was recorded." )
583
641
elseif trace
584
642
# determine columns to show, based on whether they contain useful information
585
643
columns = [:id , :start , :time ]
@@ -645,9 +703,43 @@ function render_traces(host_trace, device_trace, details;
645
703
header = [summary_column_names[name] for name in names (df)]
646
704
alignment = [i == lastindex (header) ? :l : :r for i in 1 : length (header)]
647
705
highlighters = time_highlighters (df)
648
- pretty_table (io, df; header, alignment, formatters= summary_formatter, highlighters, crop)
706
+ pretty_table (io, df; header, alignment, formatters= summary_formatter (df) , highlighters, crop)
649
707
end
650
708
end
709
+
710
+ # show NVTX ranges
711
+ # TODO : do we also want to repeat the host/device summary for each NVTX range?
712
+ # that's what nvprof used to do, but it's a little verbose...
713
+ nvtx_ranges = copy (nvtx_trace[nvtx_trace. type .== :start , :])
714
+ nvtx_ranges = leftjoin (nvtx_ranges, nvtx_trace[nvtx_trace. type .== :end ,
715
+ [:id , :start ]],
716
+ on= :id , makeunique= true )
717
+ if ! isempty (nvtx_ranges)
718
+ println (io, " \n NVTX ranges:" )
719
+
720
+ rename! (nvtx_ranges, :start_1 => :stop )
721
+ nvtx_ranges. id .= missing
722
+ nvtx_ranges. time .= nvtx_ranges. stop .- nvtx_ranges. start
723
+ nvtx_ranges. name = map (nvtx_ranges. name, nvtx_ranges. domain) do name, domain
724
+ if name != = missing && domain != = missing
725
+ " $(domain) .$(name) "
726
+ elseif name != = missing
727
+ " $name "
728
+ end
729
+ end
730
+
731
+ df = summarize_trace (nvtx_ranges)
732
+
733
+ columns = [:time_ratio , :time , :calls , :time_avg , :time_min , :time_max , :name ]
734
+ df = df[:, columns]
735
+
736
+ header = [summary_column_names[name] for name in names (df)]
737
+ alignment = [i == lastindex (header) ? :l : :r for i in 1 : length (header)]
738
+ highlighters = time_highlighters (df)
739
+ pretty_table (io, df; header, alignment, formatters= summary_formatter (df), highlighters, crop)
740
+ end
741
+
742
+ return
651
743
end
652
744
653
745
# Format the number `x` as a percentage string: `x` is multiplied by 100 and printed
# with two decimal places followed by a literal percent sign.
format_percentage (x:: Number ) = @sprintf (" %.2f%%" , x * 100 )
0 commit comments