From d9f431ebffe4feff17964f4e3750734587f465a7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 4 Apr 2023 13:24:47 +0200 Subject: [PATCH 1/7] Add an experimental opaque closure type. --- src/compiler/compilation.jl | 118 ++++++++++++++++++++++++++++++++++++ test/core/execution.jl | 46 ++++++++++++++ 2 files changed, 164 insertions(+) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 622f11cd61..a861086e7d 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -425,3 +425,121 @@ function run_and_collect(cmd) return proc, log end + + + +## opaque closures + +# TODO: once stabilised, move bits of this into GPUCompiler.jl + +using Core.Compiler: IRCode +using Core: CodeInfo, MethodInstance, CodeInstance, LineNumberNode + +struct OpaqueClosure{F, E, A, R} # func, env, args, ret + env::E +end + +# XXX: because we can't call functions from other CUDA modules, we effectively need to +# recompile when the target function changes. this, and because of how GPUCompiler's +# deferred compilation mechanism currently works, is why we have `F` as a type param. + +# XXX: because of GPU code requiring specialized signatures, we also need to recompile +# when the environment or argument types change. together with the above, this +# negates much of the benefit of opaque closures. + +# TODO: support for constructing an opaque closure from source code + +# TODO: complete support for passing an environment. this probably requires a split into +# host and device structures to, e.g., root a CuArray and pass a CuDeviceArray. + +function compute_ir_rettype(ir::IRCode) + rt = Union{} + for i = 1:length(ir.stmts) + stmt = ir.stmts[i][:inst] + if isa(stmt, Core.Compiler.ReturnNode) && isdefined(stmt, :val) + rt = Core.Compiler.tmerge(Core.Compiler.argextype(stmt.val, ir), rt) + end + end + return Core.Compiler.widenconst(rt) +end + +function compute_oc_signature(ir::IRCode, nargs::Int, isva::Bool) + argtypes = Vector{Any}(undef, nargs) + for i = 1:nargs + argtypes[i] = Core.Compiler.widenconst(ir.argtypes[i+1]) + end + if isva + lastarg = pop!(argtypes) + if lastarg <: Tuple + append!(argtypes, lastarg.parameters) + else + push!(argtypes, Vararg{Any}) + end + end + return Tuple{argtypes...} +end + +function OpaqueClosure(ir::IRCode, @nospecialize env...; isva::Bool = false) + # NOTE: we need ir.argtypes[1] == typeof(env) + ir = Core.Compiler.copy(ir) + nargs = length(ir.argtypes)-1 + sig = compute_oc_signature(ir, nargs, isva) + rt = compute_ir_rettype(ir) + src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ()) + src.slotnames = Base.fill(:none, nargs+1) + src.slotflags = Base.fill(zero(UInt8), length(ir.argtypes)) + src.slottypes = copy(ir.argtypes) + src.rettype = rt + src = Core.Compiler.ir_to_codeinf!(src, ir) + config = compiler_config(device(); kernel=false) + return generate_opaque_closure(config, src, sig, rt, nargs, isva, env...) +end + +function OpaqueGPUClosure(src::CodeInfo, @nospecialize env...) + src.inferred || throw(ArgumentError("Expected inferred src::CodeInfo")) + mi = src.parent::Core.MethodInstance + sig = Base.tuple_type_tail(mi.specTypes) + method = mi.def::Method + nargs = method.nargs-1 + isva = method.isva + return generate_opaque_closure(config, src, sig, src.rettype, nargs, isva, env...) +end + +function generate_opaque_closure(config::CompilerConfig, src::CodeInfo, + @nospecialize(sig), @nospecialize(rt), + nargs::Int, isva::Bool, @nospecialize env...; + mod::Module=@__MODULE__, + file::Union{Nothing,Symbol}=nothing, line::Int=0) + # create a method (like `jl_make_opaque_closure_method`) + meth = ccall(:jl_new_method_uninit, Ref{Method}, (Any,), Main) + meth.sig = Tuple + meth.isva = isva # XXX: probably not supported? + meth.is_for_opaque_closure = 0 # XXX: do we want this? + meth.name = Symbol("opaque gpu closure") + meth.nargs = nargs + 1 + meth.file = something(file, Symbol()) + meth.line = line + ccall(:jl_method_set_source, Nothing, (Any, Any), meth, src) + + # look up a method instance and create a compiler job + full_sig = Tuple{typeof(env), sig.parameters...} + mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, + (Any, Any, Any), meth, full_sig, Core.svec()) + job = CompilerJob(mi, config) # this captures the current world age + + # create a code instance and store it in the cache + ci = CodeInstance(mi, rt, C_NULL, src, Int32(0), meth.primary_world, typemax(UInt), + UInt32(0), UInt32(0), nothing, UInt8(0)) + Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, mi) + + id = length(GPUCompiler.deferred_codegen_jobs) + 1 + GPUCompiler.deferred_codegen_jobs[id] = job + return OpaqueClosure{id, typeof(env), sig, rt}(env) +end + +# device-side call to an opaque closure +function (oc::OpaqueClosure{F})(a, b) where F + ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), F) + assume(ptr != C_NULL) + return ccall(ptr, Int, (Int, Int), a, b) +end diff --git a/test/core/execution.jl b/test/core/execution.jl index 101f6d1194..a7ab111b75 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -1095,3 +1095,49 @@ end end ############################################################################################ + +if VERSION >= v"1.10-" +@testset "opaque closures" begin + +# basic closure, constructed from IRCode +let + ir, rettyp = only(Base.code_ircode(+, (Int, Int))) + oc = CUDA.OpaqueClosure(ir) + + c = CuArray([0]) + a = CuArray([1]) + b = CuArray([2]) + + function kernel(oc, c, a, b) + i = threadIdx().x + @inbounds c[i] = oc(a[i], b[i]) + return + end + @cuda threads=1 kernel(oc, c, a, b) + + @test Array(c)[] == 3 +end + +# basic closure, constructed from CodeInfo +let + ir, rettyp = only(Base.code_typed(+, (Int, Int))) + oc = CUDA.OpaqueClosure(ir) + + c = CuArray([0]) + a = CuArray([1]) + b = CuArray([2]) + + function kernel(oc, c, a, b) + i = threadIdx().x + @inbounds c[i] = oc(a[i], b[i]) + return + end + @cuda threads=1 kernel(oc, c, a, b) + + @test Array(c)[] == 3 +end + +end +end + +############################################################################################ From 243dbac40c76789ef8b5f0467c98a3e3aa107e3a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 14 Apr 2023 13:14:57 +0200 Subject: [PATCH 2/7] Fix construction from CodeInfo. --- src/compiler/compilation.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index a861086e7d..138591ac0c 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -495,13 +495,14 @@ function OpaqueClosure(ir::IRCode, @nospecialize env...; isva::Bool = false) return generate_opaque_closure(config, src, sig, rt, nargs, isva, env...) end -function OpaqueGPUClosure(src::CodeInfo, @nospecialize env...) +function OpaqueClosure(src::CodeInfo, @nospecialize env...) src.inferred || throw(ArgumentError("Expected inferred src::CodeInfo")) mi = src.parent::Core.MethodInstance sig = Base.tuple_type_tail(mi.specTypes) method = mi.def::Method nargs = method.nargs-1 isva = method.isva + config = compiler_config(device(); kernel=false) return generate_opaque_closure(config, src, sig, src.rettype, nargs, isva, env...) end From a9a3be255b56f74223ff5dd50fc559a331364537 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 14 Apr 2023 13:15:07 +0200 Subject: [PATCH 3/7] Generalize to arbitrary args. --- src/compiler/compilation.jl | 36 ++++++++++++++++++++++++++++++++++-- test/core/execution.jl | 17 +++++++++-------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 138591ac0c..1009ce8aa1 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -538,9 +538,41 @@ function generate_opaque_closure(config::CompilerConfig, src::CodeInfo, return OpaqueClosure{id, typeof(env), sig, rt}(env) end +# generated function `ccall`, working around the restriction that ccall type +# tuples need to be literals. this relies on ccall internals... +@inline @generated function generated_ccall(f::Ptr, _rettyp, _types, vals...) + ex = quote end + + rettyp = _rettyp.parameters[1] + types = _types.parameters[1].parameters + args = [:(vals[$i]) for i in 1:length(vals)] + + # cconvert + cconverted = [Symbol("cconverted_$i") for i in 1:length(vals)] + for (dst, typ, src) in zip(cconverted, types, args) + append!(ex.args, (quote + $dst = Base.cconvert($typ, $src) + end).args) + end + + # unsafe_convert + unsafe_converted = [Symbol("unsafe_converted_$i") for i in 1:length(vals)] + for (dst, typ, src) in zip(unsafe_converted, types, cconverted) + append!(ex.args, (quote + $dst = Base.unsafe_convert($typ, $src) + end).args) + end + + call = Expr(:foreigncall, :f, rettyp, Core.svec(types...), 0, + QuoteNode(:ccall), unsafe_converted..., cconverted...) + push!(ex.args, call) + return ex +end + # device-side call to an opaque closure -function (oc::OpaqueClosure{F})(a, b) where F +function (oc::OpaqueClosure{F,E,A,R})(args...) where {F,E,A,R} ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), F) assume(ptr != C_NULL) - return ccall(ptr, Int, (Int, Int), a, b) + #ccall(ptr, R, (A...), args...) + generated_ccall(ptr, R, A, args...) end diff --git a/test/core/execution.jl b/test/core/execution.jl index a7ab111b75..3809be114e 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -1120,21 +1120,22 @@ end # basic closure, constructed from CodeInfo let - ir, rettyp = only(Base.code_typed(+, (Int, Int))) + ir, rettyp = only(Base.code_typed(*, (Int, Int, Int))) oc = CUDA.OpaqueClosure(ir) - c = CuArray([0]) - a = CuArray([1]) - b = CuArray([2]) + d = CuArray([1]) + a = CuArray([2]) + b = CuArray([3]) + c = CuArray([4]) - function kernel(oc, c, a, b) + function kernel(oc, d, a, b, c) i = threadIdx().x - @inbounds c[i] = oc(a[i], b[i]) + @inbounds d[i] = oc(a[i], b[i], c[i]) return end - @cuda threads=1 kernel(oc, c, a, b) + @cuda threads=1 kernel(oc, d, a, b, c) - @test Array(c)[] == 3 + @test Array(d)[] == 24 end end From 774eb4a33c9bb2eb78b622a85448b4ebbf8ba971 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 14 Apr 2023 14:10:50 +0200 Subject: [PATCH 4/7] Work around SciML bug. --- src/compiler/compilation.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 1009ce8aa1..66172a3958 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -570,7 +570,10 @@ end end # device-side call to an opaque closure -function (oc::OpaqueClosure{F,E,A,R})(args...) where {F,E,A,R} +(oc::OpaqueClosure)(args...) = call(oc, args...) +## NOTE: split into two to make `SciML.isinplace(oc)` work. +## it also resembles how kernels are called. +@inline function call(oc::OpaqueClosure{F,E,A,R}, args...) where {F,E,A,R} ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), F) assume(ptr != C_NULL) #ccall(ptr, R, (A...), args...) From e8fe2a905dc93b3e3532b2afa529ade663055e1a Mon Sep 17 00:00:00 2001 From: oscarddssmith Date: Tue, 2 Apr 2024 12:03:53 -0400 Subject: [PATCH 5/7] Updates for 1.12. --- src/compiler/compilation.jl | 47 ++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 66172a3958..0b731852b3 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -455,7 +455,7 @@ end function compute_ir_rettype(ir::IRCode) rt = Union{} for i = 1:length(ir.stmts) - stmt = ir.stmts[i][:inst] + stmt = ir[Core.SSAValue(i)][:stmt] if isa(stmt, Core.Compiler.ReturnNode) && isdefined(stmt, :val) rt = Core.Compiler.tmerge(Core.Compiler.argextype(stmt.val, ir), rt) end @@ -479,31 +479,34 @@ function compute_oc_signature(ir::IRCode, nargs::Int, isva::Bool) return Tuple{argtypes...} end -function OpaqueClosure(ir::IRCode, @nospecialize env...; isva::Bool = false) +function OpaqueClosure(ir::IRCode, @nospecialize env...; + isva::Bool = false, + slotnames::Union{Nothing,Vector{Symbol}}=nothing) # NOTE: we need ir.argtypes[1] == typeof(env) ir = Core.Compiler.copy(ir) - nargs = length(ir.argtypes)-1 + # if the user didn't specify a definition MethodInstance or filename Symbol to use for the debuginfo, set a filename now + ir.debuginfo.def === nothing && (ir.debuginfo.def = :var"generated IR for OpaqueClosure") + nargtypes = length(ir.argtypes) + nargs = nargtypes-1 sig = compute_oc_signature(ir, nargs, isva) rt = compute_ir_rettype(ir) src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ()) - src.slotnames = Base.fill(:none, nargs+1) - src.slotflags = Base.fill(zero(UInt8), length(ir.argtypes)) + if slotnames === nothing + src.slotnames = Base.fill(:none, nargtypes) + else + length(slotnames) == nargtypes || error("mismatched `argtypes` and `slotnames`") + src.slotnames = slotnames + end + src.slotflags = Base.fill(zero(UInt8), nargtypes) src.slottypes = copy(ir.argtypes) - src.rettype = rt src = Core.Compiler.ir_to_codeinf!(src, ir) config = compiler_config(device(); kernel=false) return generate_opaque_closure(config, src, sig, rt, nargs, isva, env...) end -function OpaqueClosure(src::CodeInfo, @nospecialize env...) - src.inferred || throw(ArgumentError("Expected inferred src::CodeInfo")) - mi = src.parent::Core.MethodInstance - sig = Base.tuple_type_tail(mi.specTypes) - method = mi.def::Method - nargs = method.nargs-1 - isva = method.isva +function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs, isva=false) config = compiler_config(device(); kernel=false) - return generate_opaque_closure(config, src, sig, src.rettype, nargs, isva, env...) + return generate_opaque_closure(config, src, sig, rettype, nargs, isva, env...) end function generate_opaque_closure(config::CompilerConfig, src::CodeInfo, @@ -529,8 +532,20 @@ function generate_opaque_closure(config::CompilerConfig, src::CodeInfo, job = CompilerJob(mi, config) # this captures the current world age # create a code instance and store it in the cache - ci = CodeInstance(mi, rt, C_NULL, src, Int32(0), meth.primary_world, typemax(UInt), - UInt32(0), UInt32(0), nothing, UInt8(0)) + interp = GPUCompiler.get_interpreter(job) + owner = Core.Compiler.cache_owner(interp) + exctype = Any + inferred_const = C_NULL + const_flags = Int32(0) + min_world = meth.primary_world + max_world = typemax(UInt) + ipo_effects = UInt32(0) + effects = UInt32(0) + analysis_results = nothing + relocatability = UInt8(0) + ci = CodeInstance(mi, owner, rt, exctype, inferred_const, const_flags, + const_flags, min_world, max_world, ipo_effects, effects, + analysis_results, relocatability, src.debuginfo) Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, mi) id = length(GPUCompiler.deferred_codegen_jobs) + 1 From c4d7db35fade4d8e7d2254fcaf3b95c9c5285ddd Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 16 Apr 2024 16:34:43 +0200 Subject: [PATCH 6/7] Fixes. --- src/compiler/compilation.jl | 7 +++++-- test/core/execution.jl | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 0b731852b3..0f7d21cd03 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -530,20 +530,23 @@ function generate_opaque_closure(config::CompilerConfig, src::CodeInfo, mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, (Any, Any, Any), meth, full_sig, Core.svec()) job = CompilerJob(mi, config) # this captures the current world age + Base.@atomic meth.primary_world = job.world + Base.@atomic meth.deleted_world = typemax(UInt) # create a code instance and store it in the cache interp = GPUCompiler.get_interpreter(job) owner = Core.Compiler.cache_owner(interp) exctype = Any inferred_const = C_NULL + inferred = src const_flags = Int32(0) min_world = meth.primary_world - max_world = typemax(UInt) + max_world = meth.deleted_world ipo_effects = UInt32(0) effects = UInt32(0) analysis_results = nothing relocatability = UInt8(0) - ci = CodeInstance(mi, owner, rt, exctype, inferred_const, const_flags, + ci = CodeInstance(mi, owner, rt, exctype, inferred_const, inferred, const_flags, min_world, max_world, ipo_effects, effects, analysis_results, relocatability, src.debuginfo) Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, mi) diff --git a/test/core/execution.jl b/test/core/execution.jl index 3809be114e..3d23b97ca5 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -1096,7 +1096,7 @@ end ############################################################################################ -if VERSION >= v"1.10-" +if VERSION >= v"1.12-" @testset "opaque closures" begin # basic closure, constructed from IRCode @@ -1120,8 +1120,8 @@ end # basic closure, constructed from CodeInfo let - ir, rettyp = only(Base.code_typed(*, (Int, Int, Int))) - oc = CUDA.OpaqueClosure(ir) + ir, rettype = only(Base.code_typed(*, (Int, Int, Int))) + oc = CUDA.OpaqueClosure(ir; sig=Tuple{Int,Int,Int}, rettype, nargs=3) d = CuArray([1]) a = CuArray([2]) From 7782216d8cc6b4656f79b3ed3018328fe4c81381 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 17 Apr 2024 13:40:21 +0200 Subject: [PATCH 7/7] Add support for dynamically-constructed opaque closures. --- src/compiler/compilation.jl | 229 +++++++++++++++++++++++++----------- test/core/execution.jl | 41 ++++++- 2 files changed, 200 insertions(+), 70 deletions(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 0f7d21cd03..81fca77ede 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -435,22 +435,7 @@ end using Core.Compiler: IRCode using Core: CodeInfo, MethodInstance, CodeInstance, LineNumberNode -struct OpaqueClosure{F, E, A, R} # func, env, args, ret - env::E -end - -# XXX: because we can't call functions from other CUDA modules, we effectively need to -# recompile when the target function changes. this, and because of how GPUCompiler's -# deferred compilation mechanism currently works, is why we have `F` as a type param. - -# XXX: because of GPU code requiring specialized signatures, we also need to recompile -# when the environment or argument types change. together with the above, this -# negates much of the benefit of opaque closures. - -# TODO: support for constructing an opaque closure from source code - -# TODO: complete support for passing an environment. this probably requires a split into -# host and device structures to, e.g., root a CuArray and pass a CuDeviceArray. +# helpers function compute_ir_rettype(ir::IRCode) rt = Union{} @@ -463,32 +448,25 @@ function compute_ir_rettype(ir::IRCode) return Core.Compiler.widenconst(rt) end -function compute_oc_signature(ir::IRCode, nargs::Int, isva::Bool) +function compute_oc_signature(ir::IRCode, nargs::Int) argtypes = Vector{Any}(undef, nargs) for i = 1:nargs argtypes[i] = Core.Compiler.widenconst(ir.argtypes[i+1]) end - if isva - lastarg = pop!(argtypes) - if lastarg <: Tuple - append!(argtypes, lastarg.parameters) - else - push!(argtypes, Vararg{Any}) - end - end return Tuple{argtypes...} end -function OpaqueClosure(ir::IRCode, @nospecialize env...; - isva::Bool = false, - slotnames::Union{Nothing,Vector{Symbol}}=nothing) +function make_oc_codeinfo(ir::IRCode, @nospecialize env...; slotnames=nothing) # NOTE: we need ir.argtypes[1] == typeof(env) ir = Core.Compiler.copy(ir) - # if the user didn't specify a definition MethodInstance or filename Symbol to use for the debuginfo, set a filename now - ir.debuginfo.def === nothing && (ir.debuginfo.def = :var"generated IR for OpaqueClosure") + # if the user didn't specify a definition MethodInstance or filename Symbol to use + # for the debuginfo, set a filename now + if ir.debuginfo.def === nothing + ir.debuginfo.def = Symbol("IR for opaque gpu closure") + end nargtypes = length(ir.argtypes) nargs = nargtypes-1 - sig = compute_oc_signature(ir, nargs, isva) + sig = compute_oc_signature(ir, nargs) rt = compute_ir_rettype(ir) src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ()) if slotnames === nothing @@ -499,61 +477,39 @@ function OpaqueClosure(ir::IRCode, @nospecialize env...; end src.slotflags = Base.fill(zero(UInt8), nargtypes) src.slottypes = copy(ir.argtypes) - src = Core.Compiler.ir_to_codeinf!(src, ir) - config = compiler_config(device(); kernel=false) - return generate_opaque_closure(config, src, sig, rt, nargs, isva, env...) -end - -function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs, isva=false) - config = compiler_config(device(); kernel=false) - return generate_opaque_closure(config, src, sig, rettype, nargs, isva, env...) + Core.Compiler.ir_to_codeinf!(src, ir) end -function generate_opaque_closure(config::CompilerConfig, src::CodeInfo, - @nospecialize(sig), @nospecialize(rt), - nargs::Int, isva::Bool, @nospecialize env...; - mod::Module=@__MODULE__, - file::Union{Nothing,Symbol}=nothing, line::Int=0) - # create a method (like `jl_make_opaque_closure_method`) +# create a method (like `jl_make_oc_method`) +function make_oc_method(nargs; file=nothing, line=0, world=GPUCompiler.tls_world_age()) meth = ccall(:jl_new_method_uninit, Ref{Method}, (Any,), Main) meth.sig = Tuple - meth.isva = isva # XXX: probably not supported? - meth.is_for_opaque_closure = 0 # XXX: do we want this? + meth.isva = false + meth.is_for_opaque_closure = 0 meth.name = Symbol("opaque gpu closure") meth.nargs = nargs + 1 meth.file = something(file, Symbol()) meth.line = line - ccall(:jl_method_set_source, Nothing, (Any, Any), meth, src) - - # look up a method instance and create a compiler job - full_sig = Tuple{typeof(env), sig.parameters...} - mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, - (Any, Any, Any), meth, full_sig, Core.svec()) - job = CompilerJob(mi, config) # this captures the current world age - Base.@atomic meth.primary_world = job.world + Base.@atomic meth.primary_world = world Base.@atomic meth.deleted_world = typemax(UInt) + return meth +end - # create a code instance and store it in the cache - interp = GPUCompiler.get_interpreter(job) +function make_oc_codeinstance(mi::MethodInstance, src::CodeInfo; interp, world, rt) owner = Core.Compiler.cache_owner(interp) exctype = Any inferred_const = C_NULL inferred = src const_flags = Int32(0) - min_world = meth.primary_world - max_world = meth.deleted_world + min_world = world + max_world = typemax(UInt) ipo_effects = UInt32(0) effects = UInt32(0) analysis_results = nothing relocatability = UInt8(0) - ci = CodeInstance(mi, owner, rt, exctype, inferred_const, inferred, - const_flags, min_world, max_world, ipo_effects, effects, - analysis_results, relocatability, src.debuginfo) - Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, mi) - - id = length(GPUCompiler.deferred_codegen_jobs) + 1 - GPUCompiler.deferred_codegen_jobs[id] = job - return OpaqueClosure{id, typeof(env), sig, rt}(env) + CodeInstance(mi, owner, rt, exctype, inferred_const, inferred, + const_flags, min_world, max_world, ipo_effects, effects, + analysis_results, relocatability, src.debuginfo) end # generated function `ccall`, working around the restriction that ccall type @@ -587,7 +543,60 @@ end return ex end -# device-side call to an opaque closure +# static opaque closures + +# XXX: because we can't call functions from other CUDA modules, we effectively need to +# recompile when the target function changes. this, and because of how GPUCompiler's +# deferred compilation mechanism currently works, is why we have `F` as a type param. + +# XXX: because of GPU code requiring specialized signatures, we also need to recompile +# when the environment or argument types change. together with the above, this +# negates much of the benefit of opaque closures. + +# TODO: support for constructing an opaque closure from source code + +# TODO: complete support for passing an environment. this probably requires a split into +# host and device structures to, e.g., root a CuArray and pass a CuDeviceArray. + +struct OpaqueClosure{F, E, A, R} # func, env, args, ret + env::E +end + +function OpaqueClosure(ir::IRCode, @nospecialize env...; + slotnames::Union{Nothing,Vector{Symbol}}=nothing) + nargtypes = length(ir.argtypes) + nargs = nargtypes-1 + sig = compute_oc_signature(ir, nargs) + rt = compute_ir_rettype(ir) + src = make_oc_codeinfo(ir, env...; slotnames) + return create_static_oc(src, sig, rt, nargs, env...) +end + +function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs) + return create_static_oc(src, sig, rettype, nargs, env...) +end + +function create_static_oc(src, @nospecialize(sig), @nospecialize(rt), nargs::Int, + @nospecialize env...; file=nothing, line=0) + config = compiler_config(device(); kernel=false) + meth = make_oc_method(nargs; file, line) + + # look up a method instance and create a compiler job + full_sig = Tuple{typeof(env), sig.parameters...} + mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, + (Any, Any, Any), meth, full_sig, Core.svec()) + job = CompilerJob(mi, config, meth.primary_world) + + # create a callable object + id = length(GPUCompiler.deferred_codegen_jobs) + 1 + GPUCompiler.deferred_codegen_jobs[id] = job + oc = OpaqueClosure{id, typeof(env), sig, rt}(env) + + opaque_closure_jobs[job] = (; oc, src, rt) + return oc +end + +# device-side call (oc::OpaqueClosure)(args...) = call(oc, args...) ## NOTE: split into two to make `SciML.isinplace(oc)` work. ## it also resembles how kernels are called. @@ -597,3 +606,87 @@ end #ccall(ptr, R, (A...), args...) generated_ccall(ptr, R, A, args...) end + +# dynamic opaque closures + +const jit_opaque_closures = Dict() + +struct JITOpaqueClosure{B, T} + builder::B + tfunc::T + + function JITOpaqueClosure(builder, tfunc=Returns(nothing); nargs) + # the device and world are captured at closure construction time, but we only need + # them when creating the CompilerJob. as we cannot simply encode them in the + # JITOpaqueClosure object, we store them in a global dictionary instead. + config = compiler_config(device(); kernel=false) + meth = make_oc_method(nargs) + + # create a callable object + oc = new{typeof(builder), typeof(tfunc)}(builder, tfunc) + jit_opaque_closures[typeof(oc)] = (; env=(), meth, config, oc) + + return oc + end +end + +# device-side call +function (oc::JITOpaqueClosure)(args...) + rt = oc.tfunc(map(Core.Typeof, args)...) + call(oc, rt, args...) +end +@inline @generated function call(oct::JITOpaqueClosure{B,T}, ::Type{R}, args...) where {B,T,R} + rt = R + (; env, meth, config, oc) = jit_opaque_closures[oct] + + # look up a method instance and create a compiler job + full_sig = Tuple{typeof(env), args...} + mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, + (Any, Any, Any), meth, full_sig, Core.svec()) + job = CompilerJob(mi, config, meth.primary_world) + opaque_closure_jobs[job] = (; oc, args, rt) + + # generate a deferred compilation call + id = length(GPUCompiler.deferred_codegen_jobs) + 1 + GPUCompiler.deferred_codegen_jobs[id] = job + quote + ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), $id) + assume(ptr != C_NULL) + #ccall(ptr, R, (A...), args...) + generated_ccall(ptr, $rt, $(Tuple{args...}), args...) + end +end + +# compilation of opaque closures + +const opaque_closure_jobs = Dict{CompilerJob,Any}() + +function GPUCompiler.prepare_job!(@nospecialize(job::CUDACompilerJob)) + if haskey(opaque_closure_jobs, job) + rt = opaque_closure_jobs[job].rt + oc = opaque_closure_jobs[job].oc + if oc isa JITOpaqueClosure + args = opaque_closure_jobs[job].args + nargs = length(args) + + src = oc.builder(args...) + if src isa IRCode + nargtypes = length(src.argtypes) + nargs = nargtypes-1 + sig = compute_oc_signature(src, nargs) + @assert compute_ir_rettype(src) == rt "Inferred return type does not match the provided return type" + src = make_oc_codeinfo(src) + end + else + src = opaque_closure_jobs[job].src + end + @assert src isa CodeInfo + + # create a code instance and store it in the cache + interp = GPUCompiler.get_interpreter(job) + ci = make_oc_codeinstance(job.source, src; interp, job.world, rt) + Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, job.source) + end + + return +end diff --git a/test/core/execution.jl b/test/core/execution.jl index 3d23b97ca5..d0091adde1 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -1099,7 +1099,7 @@ end if VERSION >= v"1.12-" @testset "opaque closures" begin -# basic closure, constructed from IRCode +# static closure, constructed from IRCode let ir, rettyp = only(Base.code_ircode(+, (Int, Int))) oc = CUDA.OpaqueClosure(ir) @@ -1118,7 +1118,7 @@ let @test Array(c)[] == 3 end -# basic closure, constructed from CodeInfo +# static closure, constructed from CodeInfo let ir, rettype = only(Base.code_typed(*, (Int, Int, Int))) oc = CUDA.OpaqueClosure(ir; sig=Tuple{Int,Int,Int}, rettype, nargs=3) @@ -1138,6 +1138,43 @@ let @test Array(d)[] == 24 end +# dynamic closure, constructing IRCode based on argument types +let + tfunc(arg1, arg2) = Core.Compiler.return_type(+, Tuple{arg1,arg2}) + function builder(arg1, arg2) + ir, rettyp = only(Base.code_ircode(+, (arg1, arg2))) + return ir + end + + oc = CUDA.JITOpaqueClosure(builder, tfunc; nargs=2) + + function kernel(oc, c, a, b) + i = threadIdx().x + @inbounds c[i] = oc(a[i], b[i]) + return + end + + let + c = CuArray([0]) + a = CuArray([1]) + b = CuArray([2]) + + @cuda threads=1 kernel(oc, c, a, b) + + @test Array(c)[] == 3 + end + + let + c = CuArray([3f0]) + a = CuArray([4f0]) + b = CuArray([5f0]) + + @cuda threads=1 kernel(oc, c, a, b) + + @test Array(c)[] == 9f0 + end +end + end end