@@ -296,17 +296,17 @@ The output of this function is automatically cached, i.e. you can simply call `c
296
296
in a hot path without degrading performance. New code will be generated automatically, when
297
297
the function changes, or when different types or keyword arguments are provided.
298
298
"""
299
- function cufunction (f:: F , tt:: TT = Tuple{}; name = nothing , always_inline = false , kwargs... ) where {F,TT}
299
+ function cufunction (f:: F , tt:: TT = Tuple{}; kwargs... ) where {F,TT}
300
300
cuda = active_state ()
301
+
302
+ # compile the function
301
303
cache = cufunction_cache (cuda. context)
302
- target = CUDACompilerTarget (cuda. device; kwargs... )
303
- params = CUDACompilerParams ()
304
- config = CompilerConfig (target, params; kernel= true , name, always_inline)
304
+ config = cufunction_compiler (cuda. device; kwargs... ):: CUDACompilerConfig
305
305
fun = GPUCompiler. cached_compilation (cache, config, F, tt,
306
- cufunction_compile,
307
- cufunction_link)
308
- # compilation is cached on the function type, so we can only create a kernel object here
309
- # (as it captures the function _instance_). this allocates, so use another cache level.
306
+ cufunction_compile, cufunction_link)
307
+
308
+ # create a callable object that captures the function instance. we don't need to think
309
+ # about world age here, as GPUCompiler already does and will return a different object
310
310
h = hash (fun, hash (f, hash (tt)))
311
311
kernel = get (_cufunction_kernel_cache, h, nothing )
312
312
if kernel === nothing
@@ -331,6 +331,23 @@ function cufunction_cache(ctx::CuContext)
331
331
return subcache
332
332
end
333
333
334
# Memoized compiler configurations, keyed by a hash of the device and the keyword
# arguments that were passed to `cufunction_compiler`.
const _cufunction_compiler_cache = Dict{UInt, CUDACompilerConfig}()

# Look up the compiler configuration for `dev` and `kwargs`. On a miss, the configuration
# is built by `cufunction_compiler_create` and stored so later calls with the same device
# and keyword arguments reuse it.
function cufunction_compiler(dev; kwargs...)
    key = hash(dev, hash(kwargs))

    # fast path: this combination has been seen before
    cached = get(_cufunction_compiler_cache, key, nothing)
    cached === nothing || return cached

    # slow path: build the configuration once and remember it
    fresh = cufunction_compiler_create(dev; kwargs...)
    _cufunction_compiler_cache[key] = fresh
    return fresh
end
344
# Build a fresh compiler configuration for `dev`. `name` and `always_inline` are passed on
# to `CompilerConfig`; every other keyword argument is forwarded to `CUDACompilerTarget`.
# The configuration is always created with `kernel=true`.
@noinline function cufunction_compiler_create(dev; name=nothing, always_inline=false,
                                              kwargs...)
    # TODO: merge with `device_properties`
    return CompilerConfig(CUDACompilerTarget(dev; kwargs...), CUDACompilerParams();
                          kernel=true, name=name, always_inline=always_inline)
end
350
+
334
351
# Cache consulted by `cufunction` for kernel objects; entries are looked up by a hash of
# the compiled function, the function instance and the argument tuple type.
const _cufunction_kernel_cache = Dict {UInt, Any} ();
335
352
336
353
# helper to run a binary and collect all relevant output
0 commit comments