Commit b210f61
Update benchmarks for 1.8 and 1.9 (#1933)
1 parent: acd245e

File tree: 6 files changed, +16 -25 lines

.buildkite/pipeline.yml
Lines changed: 2 additions & 0 deletions

@@ -304,6 +304,8 @@ steps:
       julia:
         - "1.6"
         - "1.7"
+        - "1.8"
+        - "1.9"
   timeout_in_minutes: 30

   env:

lib/cudadrv/state.jl
Lines changed: 1 addition & 1 deletion

@@ -69,7 +69,7 @@ end
 function task_local_state!(args...)
     tls = task_local_storage()
     if haskey(tls, :CUDA)
-        validate_task_local_state(@inbounds(tls[:CUDA]))
+        validate_task_local_state(@inbounds(tls[:CUDA])::TaskLocalState)
     else
         # verify that CUDA.jl is functional. this doesn't belong here, but since we can't
         # error during `__init__`, we do it here instead as this is the first function
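
The added `::TaskLocalState` assertion narrows the `Any`-typed value coming out of task-local storage, so downstream code can be inferred concretely. A minimal sketch of the same pattern, using a hypothetical `Counter` type in place of CUDA.jl's `TaskLocalState`:

    # task_local_storage() hands back an untyped dictionary, so a lookup is
    # inferred as Any unless the call site asserts the stored type.
    mutable struct Counter
        hits::Int
    end

    function current_counter()
        tls = task_local_storage()
        if haskey(tls, :counter)
            tls[:counter]::Counter   # assertion restores a concrete type
        else
            tls[:counter] = Counter(0)
        end
    end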

perf/kernel.jl
Lines changed: 3 additions & 7 deletions

@@ -2,15 +2,11 @@ using CUDA: i32

 group = addgroup!(SUITE, "kernel")

-dummy_kernel() = nothing
-group["launch"] = @benchmarkable @cuda dummy_kernel()
+group["launch"] = @benchmarkable @cuda identity(nothing)

-wanted_threads = 10000
 group["occupancy"] = @benchmarkable begin
-    kernel = @cuda launch=false dummy_kernel()
-    config = launch_configuration(kernel.fun)
-    threads = min($wanted_threads, config.threads)
-    blocks = cld($wanted_threads, threads)
+    kernel = @cuda launch=false identity(nothing)
+    launch_configuration(kernel.fun)
 end

 src = CUDA.rand(Float32, 512, 1000)
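
With `identity(nothing)` a dedicated `dummy_kernel` definition becomes unnecessary (any call that compiles to an empty body works as a no-op launch), and the occupancy benchmark now times only the `launch_configuration` query itself. A self-contained sketch of the simplified suite, assuming CUDA.jl and BenchmarkTools are installed and a GPU is present:

    using CUDA, BenchmarkTools

    SUITE = BenchmarkGroup()
    group = addgroup!(SUITE, "kernel")

    # pure kernel-launch overhead: any no-op callable will do
    group["launch"] = @benchmarkable @cuda identity(nothing)

    # compile without launching, then query the occupancy API
    group["occupancy"] = @benchmarkable begin
        kernel = @cuda launch=false identity(nothing)
        launch_configuration(kernel.fun)
    end

    # results = run(SUITE; verbose=true)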

perf/latency.jl
Lines changed: 4 additions & 11 deletions

@@ -15,27 +15,20 @@ function main()

     # time to precompile the package and its dependencies
     precompile_cmd =
-        `$base_cmd -e "uuid = Base.UUID(\"052768ef-5323-5732-b1bb-66c8b64840ba\")
-                       id = Base.PkgId(uuid, \"CUDA\")
-                       Base.compilecache(id)"`
+        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
+                       Base.compilecache(pkg)"`
     results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60

     # time to actually import the package
     import_cmd =
         `$base_cmd -e "using CUDA"`
     results["import"] = @benchmark run($import_cmd) evals=1 seconds=30

-    # time to initialize CUDA and all other libraries
-    initialize_time =
-        `$base_cmd -e "using CUDA
-                       CUDA.driver_version()"`
-    results["initialize"] = @benchmark run($initialize_time) evals=1 seconds=30
-
     # time to actually compile a kernel
     ttfp_cmd =
         `$base_cmd -e "using CUDA
-                       kernel() = return
-                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
+            kernel() = return
+            CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
     results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

     results
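
Resolving the package through `Base.identify_package` avoids hard-coding CUDA.jl's UUID: it returns the `Base.PkgId` that "CUDA" resolves to in the active environment, which `Base.compilecache` accepts directly (the diff also drops the separate "initialize" benchmark). A small sketch of that lookup on its own:

    # Resolve a package from the active environment instead of spelling out
    # its UUID; identify_package returns `nothing` if it is not installed.
    pkg = Base.identify_package("CUDA")
    pkg === nothing && error("CUDA is not available in this environment")
    Base.compilecache(pkg)   # force a fresh precompilation cache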

src/compiler/compilation.jl
Lines changed: 2 additions & 2 deletions

@@ -47,11 +47,11 @@ GPUCompiler.kernel_state_type(job::CUDACompilerJob) = KernelState
 ## compiler implementation (cache, configure, compile, and link)

 # cache of compilation caches, per context
-const _compiler_caches = Dict{CuContext, Dict{Any, Any}}();
+const _compiler_caches = Dict{CuContext, Dict{Any, CuFunction}}();
 function compiler_cache(ctx::CuContext)
     cache = get(_compiler_caches, ctx, nothing)
     if cache === nothing
-        cache = Dict{Any, Any}()
+        cache = Dict{Any, CuFunction}()
         _compiler_caches[ctx] = cache
     end
     return cache
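
Tightening the value type from `Any` to `CuFunction` means a lookup in the per-context cache already yields a concretely typed value, without changing behaviour. A sketch of the same get-or-create pattern, with a hypothetical `Payload` type standing in for `CuFunction`:

    struct Payload
        id::Int
    end

    # one inner cache per context key; the concrete value type aids inference
    const CACHES = Dict{Symbol, Dict{Any, Payload}}()

    function cache_for(ctx::Symbol)
        cache = get(CACHES, ctx, nothing)
        if cache === nothing
            cache = Dict{Any, Payload}()
            CACHES[ctx] = cache
        end
        return cache
    end

The same lookup could also be written as `get!(() -> Dict{Any, Payload}(), CACHES, ctx)`.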

src/compiler/execution.jl
Lines changed: 4 additions & 4 deletions

@@ -319,22 +319,22 @@ function cufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}

     # create a callable object that captures the function instance. we don't need to think
     # about world age here, as GPUCompiler already does and will return a different object
-    h = hash(fun, hash(f, hash(tt)))
-    kernel = get(_kernel_instances, h, nothing)
+    key = (objectid(source), hash(fun), f)
+    kernel = get(_kernel_instances, key, nothing)
     if kernel === nothing
         # create the kernel state object
         exception_ptr = create_exceptions!(fun.mod)
         state = KernelState(exception_ptr)

         kernel = HostKernel{F,tt}(f, fun, state)
-        _kernel_instances[h] = kernel
+        _kernel_instances[key] = kernel
     end
     return kernel::HostKernel{F,tt}
 end
end

 # cache of kernel instances
-const _kernel_instances = Dict{UInt, Any}()
+const _kernel_instances = Dict{Any, Any}()

 function (kernel::HostKernel)(args...; threads::CuDim=1, blocks::CuDim=1, kwargs...)
     call(kernel, map(cudaconvert, args)...; threads, blocks, kwargs...)
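
Switching the cache key from a single `UInt` hash to a tuple (and the dictionary to `Dict{Any, Any}`) means entries are compared with `==` on their components rather than identified only by a hash value, presumably to rule out silent collisions between distinct kernels. A toy sketch contrasting the two keying strategies, with a plain `String` as a stand-in value:

    hash_cache  = Dict{UInt, String}()
    tuple_cache = Dict{Any, String}()

    function by_hash(f, tt)
        h = hash(f, hash(tt))                # two inputs could share this value
        get!(() -> "compiled $f for $tt", hash_cache, h)
    end

    function by_key(f, tt)
        key = (f, tt)                        # stored and compared with ==
        get!(() -> "compiled $f for $tt", tuple_cache, key)
    end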
