Commit b210f61
Update benchmarks for 1.8 and 1.9 (#1933)
1 parent: acd245e

File tree: 6 files changed, +16 -25 lines

.buildkite/pipeline.yml
Lines changed: 2 additions & 0 deletions

@@ -304,6 +304,8 @@ steps:
       julia:
         - "1.6"
         - "1.7"
+        - "1.8"
+        - "1.9"
   timeout_in_minutes: 30

   env:

lib/cudadrv/state.jl
Lines changed: 1 addition & 1 deletion

@@ -69,7 +69,7 @@ end
 function task_local_state!(args...)
     tls = task_local_storage()
     if haskey(tls, :CUDA)
-        validate_task_local_state(@inbounds(tls[:CUDA]))
+        validate_task_local_state(@inbounds(tls[:CUDA])::TaskLocalState)
     else
         # verify that CUDA.jl is functional. this doesn't belong here, but since we can't
         # error during `__init__`, we do it here instead as this is the first function
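
The added `::TaskLocalState` assertion narrows the `Any`-typed value coming out of task-local storage, so downstream code can be inferred concretely. A minimal sketch of the same pattern, using a hypothetical `Counter` type in place of CUDA.jl's `TaskLocalState`:

    # task_local_storage() hands back an untyped dictionary, so a lookup is
    # inferred as Any unless the call site asserts the stored type.
    mutable struct Counter
        hits::Int
    end

    function current_counter()
        tls = task_local_storage()
        if haskey(tls, :counter)
            tls[:counter]::Counter   # assertion restores a concrete type
        else
            tls[:counter] = Counter(0)
        end
    end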

perf/kernel.jl
Lines changed: 3 additions & 7 deletions

@@ -2,15 +2,11 @@ using CUDA: i32

 group = addgroup!(SUITE, "kernel")

-dummy_kernel() = nothing
-group["launch"] = @benchmarkable @cuda dummy_kernel()
+group["launch"] = @benchmarkable @cuda identity(nothing)

-wanted_threads = 10000
 group["occupancy"] = @benchmarkable begin
-    kernel = @cuda launch=false dummy_kernel()
-    config = launch_configuration(kernel.fun)
-    threads = min($wanted_threads, config.threads)
-    blocks = cld($wanted_threads, threads)
+    kernel = @cuda launch=false identity(nothing)
+    launch_configuration(kernel.fun)
 end

 src = CUDA.rand(Float32, 512, 1000)
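
With `identity(nothing)` a dedicated `dummy_kernel` definition becomes unnecessary (any call that compiles to an empty body works as a no-op launch), and the occupancy benchmark now times only the `launch_configuration` query itself. A self-contained sketch of the simplified suite, assuming CUDA.jl and BenchmarkTools are installed and a GPU is present:

    using CUDA, BenchmarkTools

    SUITE = BenchmarkGroup()
    group = addgroup!(SUITE, "kernel")

    # pure kernel-launch overhead: any no-op callable will do
    group["launch"] = @benchmarkable @cuda identity(nothing)

    # compile without launching, then query the occupancy API
    group["occupancy"] = @benchmarkable begin
        kernel = @cuda launch=false identity(nothing)
        launch_configuration(kernel.fun)
    end

    # results = run(SUITE; verbose=true)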

perf/latency.jl
Lines changed: 4 additions & 11 deletions

@@ -15,27 +15,20 @@ function main()

     # time to precompile the package and its dependencies
     precompile_cmd =
-        `$base_cmd -e "uuid = Base.UUID(\"052768ef-5323-5732-b1bb-66c8b64840ba\")
-                       id = Base.PkgId(uuid, \"CUDA\")
-                       Base.compilecache(id)"`
+        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
+                       Base.compilecache(pkg)"`
     results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60

     # time to actually import the package
     import_cmd =
         `$base_cmd -e "using CUDA"`
     results["import"] = @benchmark run($import_cmd) evals=1 seconds=30

-    # time to initialize CUDA and all other libraries
-    initialize_time =
-        `$base_cmd -e "using CUDA
-                       CUDA.driver_version()"`
-    results["initialize"] = @benchmark run($initialize_time) evals=1 seconds=30
-
     # time to actually compile a kernel
     ttfp_cmd =
         `$base_cmd -e "using CUDA
-                       kernel() = return
-                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
+            kernel() = return
+            CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
     results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

     results
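
Resolving the package through `Base.identify_package` avoids hard-coding CUDA.jl's UUID: it returns the `Base.PkgId` that "CUDA" resolves to in the active environment, which `Base.compilecache` accepts directly (the diff also drops the separate "initialize" benchmark). A small sketch of that lookup on its own:

    # Resolve a package from the active environment instead of spelling out
    # its UUID; identify_package returns `nothing` if it is not installed.
    pkg = Base.identify_package("CUDA")
    pkg === nothing && error("CUDA is not available in this environment")
    Base.compilecache(pkg)   # force a fresh precompilation cache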

src/compiler/compilation.jl
Lines changed: 2 additions & 2 deletions

@@ -47,11 +47,11 @@ GPUCompiler.kernel_state_type(job::CUDACompilerJob) = KernelState
 ## compiler implementation (cache, configure, compile, and link)

 # cache of compilation caches, per context
-const _compiler_caches = Dict{CuContext, Dict{Any, Any}}();
+const _compiler_caches = Dict{CuContext, Dict{Any, CuFunction}}();
 function compiler_cache(ctx::CuContext)
     cache = get(_compiler_caches, ctx, nothing)
     if cache === nothing
-        cache = Dict{Any, Any}()
+        cache = Dict{Any, CuFunction}()
         _compiler_caches[ctx] = cache
     end
     return cache
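
Tightening the value type from `Any` to `CuFunction` means a lookup in the per-context cache already yields a concretely typed value, without changing behaviour. A sketch of the same get-or-create pattern, with a hypothetical `Payload` type standing in for `CuFunction`:

    struct Payload
        id::Int
    end

    # one inner cache per context key; the concrete value type aids inference
    const CACHES = Dict{Symbol, Dict{Any, Payload}}()

    function cache_for(ctx::Symbol)
        cache = get(CACHES, ctx, nothing)
        if cache === nothing
            cache = Dict{Any, Payload}()
            CACHES[ctx] = cache
        end
        return cache
    end

The same lookup could also be written as `get!(() -> Dict{Any, Payload}(), CACHES, ctx)`.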

src/compiler/execution.jl
Lines changed: 4 additions & 4 deletions

@@ -319,22 +319,22 @@ function cufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}

     # create a callable object that captures the function instance. we don't need to think
     # about world age here, as GPUCompiler already does and will return a different object
-    h = hash(fun, hash(f, hash(tt)))
-    kernel = get(_kernel_instances, h, nothing)
+    key = (objectid(source), hash(fun), f)
+    kernel = get(_kernel_instances, key, nothing)
     if kernel === nothing
         # create the kernel state object
         exception_ptr = create_exceptions!(fun.mod)
         state = KernelState(exception_ptr)

         kernel = HostKernel{F,tt}(f, fun, state)
-        _kernel_instances[h] = kernel
+        _kernel_instances[key] = kernel
     end
     return kernel::HostKernel{F,tt}
 end
end

 # cache of kernel instances
-const _kernel_instances = Dict{UInt, Any}()
+const _kernel_instances = Dict{Any, Any}()

 function (kernel::HostKernel)(args...; threads::CuDim=1, blocks::CuDim=1, kwargs...)
     call(kernel, map(cudaconvert, args)...; threads, blocks, kwargs...)
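
Switching the cache key from a single `UInt` hash to a tuple (and the dictionary to `Dict{Any, Any}`) means entries are compared with `==` on their components rather than identified only by a hash value, presumably to rule out silent collisions between distinct kernels. A toy sketch contrasting the two keying strategies, with a plain `String` as a stand-in value:

    hash_cache  = Dict{UInt, String}()
    tuple_cache = Dict{Any, String}()

    function by_hash(f, tt)
        h = hash(f, hash(tt))                # two inputs could share this value
        get!(() -> "compiled $f for $tt", hash_cache, h)
    end

    function by_key(f, tt)
        key = (f, tt)                        # stored and compared with ==
        get!(() -> "compiled $f for $tt", tuple_cache, key)
    end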
