Skip to content

Commit a90cba1

Browse files
authored
Profiler tweaks. (#2432)
1 parent 40bcc37 commit a90cba1

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

src/profile.jl

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -128,10 +128,8 @@ using Printf
128128
#
129129

130130
function profile_externally(f)
131-
# wait for the device to become idle (and trigger a GC to avoid interference)
131+
# wait for the device to become idle
132132
CUDA.cuCtxSynchronize()
133-
GC.gc(false)
134-
GC.gc(true)
135133

136134
start()
137135
try
@@ -293,11 +291,13 @@ function profile_internally(f; concurrent=true, kwargs...)
293291
end
294292
cfg = CUPTI.ActivityConfig(activity_kinds)
295293

296-
# wait for the device to become idle (and trigger a GC to avoid interference)
294+
# wait for the device to become idle
297295
CUDA.cuCtxSynchronize()
298296

299297
CUPTI.enable!(cfg) do
300-
# sink the initial profiler overhead into a synchronization call
298+
# perform dummy operations to "warm up" the profiler, and avoid slow first calls.
299+
# we'll skip everything up until the synchronization call during processing
300+
CuArray([1])
301301
CUDA.cuCtxSynchronize()
302302

303303
f()
@@ -710,7 +710,8 @@ function Base.show(io::IO, results::ProfileResults)
710710
# called a lot during compilation
711711
"cuDeviceGetAttribute",
712712
# done before every memory operation
713-
"cuPointerGetAttribute", "cuDeviceGetMemPool"])
713+
"cuPointerGetAttribute", "cuDeviceGetMemPool",
714+
"cuStreamGetCaptureInfo"])
714715
end
715716
end
716717

0 commit comments

Comments
 (0)