From 84f132f433233202f5ee16eae4814841f75607c3 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Mon, 15 Apr 2024 13:24:31 -0400
Subject: [PATCH 1/5] Use PrecompileTools to warmup CUDA.jl

---
 Project.toml      |  2 ++
 src/precompile.jl | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/Project.toml b/Project.toml
index 21c476cbca..1342f7d1bf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,6 +23,7 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
+PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -68,6 +69,7 @@ Libdl = "1"
 LinearAlgebra = "1"
 Logging = "1"
 NVTX = "0.3.2"
+PrecompileTools = "1.2.1"
 Preferences = "1"
 PrettyTables = "2"
 Printf = "1"
diff --git a/src/precompile.jl b/src/precompile.jl
index fc95f362ba..67042aa3f7 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -14,3 +14,17 @@ precompile(run_and_collect, (Cmd,))
 precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
+
+@static if VERSION >= v"1.11.-"
+using PrecompileTools: @setup_workload, @compile_workload
+@setup_workload let
+    @compile_workload begin
+        target = PTXCompilerTarget(; cap=v"7.5")
+        params = CUDACompilerParams(; cap=v"7.5", ptx=v"7.5")
+        config = CompilerConfig(target, params)
+        mi = GPUCompiler.methodinstance(typeof(identity), Tuple{Nothing})
+        job = CompilerJob(mi, config)
+        GPUCompiler.code_native(devnull, job)
+    end
+end
+end

From 85fc9eacc3b8578464c27bb1a272791b663c96ea Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Wed, 17 Apr 2024 16:50:20 -0400
Subject: [PATCH 2/5] fixup! Use PrecompileTools to warmup CUDA.jl

---
 src/precompile.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/precompile.jl b/src/precompile.jl
index 67042aa3f7..dd0010839a 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -15,7 +15,7 @@ precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
 
-@static if VERSION >= v"1.11.-"
+@static if VERSION >= v"1.11.0-DEV.1603"
 using PrecompileTools: @setup_workload, @compile_workload
 @setup_workload let
     @compile_workload begin

From 389a1f14207ff7887314fb2dcd81f5a9edd886d7 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Wed, 17 Apr 2024 16:51:41 -0400
Subject: [PATCH 3/5] fixup! Use PrecompileTools to warmup CUDA.jl

---
 src/precompile.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/precompile.jl b/src/precompile.jl
index dd0010839a..ad359dba17 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -15,8 +15,8 @@ precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
 
-@static if VERSION >= v"1.11.0-DEV.1603"
 using PrecompileTools: @setup_workload, @compile_workload
+@static if VERSION >= v"1.11.0-DEV.1603"
 @setup_workload let
     @compile_workload begin
         target = PTXCompilerTarget(; cap=v"7.5")

From 134c9a724fb1fe7f2e49b7501cb97033eaef4a85 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Mon, 24 Jun 2024 09:53:19 -0400
Subject: [PATCH 4/5] try precompile tools on all versions

---
 src/precompile.jl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/precompile.jl b/src/precompile.jl
index ad359dba17..3228a8aab9 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -16,7 +16,6 @@ precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
 
 using PrecompileTools: @setup_workload, @compile_workload
-@static if VERSION >= v"1.11.0-DEV.1603"
 @setup_workload let
     @compile_workload begin
         target = PTXCompilerTarget(; cap=v"7.5")
@@ -27,4 +26,3 @@ using PrecompileTools: @setup_workload, @compile_workload
         GPUCompiler.code_native(devnull, job)
     end
 end
-end

From bfe2eb9c847d305fc6dc4804391fc0d220514620 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Mon, 24 Jun 2024 09:55:51 -0400
Subject: [PATCH 5/5] Add a note for precompile_workload

---
 LocalPreferences.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/LocalPreferences.toml b/LocalPreferences.toml
index 513fc75593..57209ef082 100644
--- a/LocalPreferences.toml
+++ b/LocalPreferences.toml
@@ -16,6 +16,9 @@
 # possible values: "device", "unified", "host"
 #default_memory = "device"
 
+# From PrecompileTools, whether or not to precompile the GPUCompiler + Inference stack
+#precompile_workload = true
+
 [CUDA_Driver_jll]
 # whether to attempt to load a forwards-compatibile userspace driver.
 # only turn this off if you experience issues, e.g., when using a local
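
For reference, the precompile_workload key documented in PATCH 5/5 is the standard opt-out switch that PrecompileTools consults before running a package's @compile_workload block. A minimal sketch of how a user could disable the workload for CUDA.jl via Preferences.jl; this snippet is illustrative only and not part of the patches above, and it assumes the preference name introduced in LocalPreferences.toml:

    # Write precompile_workload = false into the [CUDA] table of the active
    # environment's LocalPreferences.toml, so the @compile_workload block in
    # src/precompile.jl is skipped the next time CUDA.jl precompiles.
    using CUDA
    using Preferences
    set_preferences!(CUDA, "precompile_workload" => false; force=true)

Setting the preference back to true (or removing it) restores the default of running the GPUCompiler warmup during precompilation.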