diff --git a/Project.toml b/Project.toml index d6a2f870..566a41f7 100644 --- a/Project.toml +++ b/Project.toml @@ -14,13 +14,14 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" -SPIRV_LLVM_Translator_jll = "4a5d46fc-d8cf-5151-a261-86b458210efb" +SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c" +SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] Adapt = "4" GPUArrays = "11.2.1" -GPUCompiler = "0.27, 1" +GPUCompiler = "1.2" KernelAbstractions = "0.9.2" LLVM = "9.1" LinearAlgebra = "1" @@ -28,7 +29,8 @@ OpenCL_jll = "=2024.10.24" Printf = "1" Random = "1" Reexport = "1" -SPIRVIntrinsics = "0.2" -SPIRV_LLVM_Translator_jll = "20" +SPIRVIntrinsics = "0.3" +SPIRV_LLVM_Backend_jll = "20" +SPIRV_Tools_jll = "2024.4" StaticArrays = "1" julia = "1.10" diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml index 75302449..262862f0 100644 --- a/lib/intrinsics/Project.toml +++ b/lib/intrinsics/Project.toml @@ -1,7 +1,7 @@ name = "SPIRVIntrinsics" uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" authors = ["Tim Besard "] -version = "0.2.1" +version = "0.3.0" [deps] ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl index 144026fe..a1c6007d 100644 --- a/lib/intrinsics/src/atomic.jl +++ b/lib/intrinsics/src/atomic.jl @@ -8,7 +8,7 @@ const atomic_integer_types = [UInt32, Int32] # TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS # TODO: additional floating-point atomics with ZE_extension_float_atomics -const atomic_memory_types = [AS.Local, AS.Global] +const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup] # generically typed diff --git a/lib/intrinsics/src/math.jl b/lib/intrinsics/src/math.jl index 1e4c2a97..d51d603b 100644 --- a/lib/intrinsics/src/math.jl +++ b/lib/intrinsics/src/math.jl @@ -94,8 +94,8 @@ for gentype in generic_types cosval = Ref{$gentype}() sinval = GC.@preserve cosval begin ptr = Base.unsafe_convert(Ptr{$gentype}, cosval) - llvm_ptr = reinterpret(LLVMPtr{$gentype, AS.Private}, ptr) - @builtin_ccall("sincos", $gentype, ($gentype, LLVMPtr{$gentype, AS.Private}), x, llvm_ptr) + llvm_ptr = reinterpret(LLVMPtr{$gentype, AS.Function}, ptr) + @builtin_ccall("sincos", $gentype, ($gentype, LLVMPtr{$gentype, AS.Function}), x, llvm_ptr) end return sinval, cosval[] end diff --git a/lib/intrinsics/src/memory.jl b/lib/intrinsics/src/memory.jl index d5ffe260..da5c5b2b 100644 --- a/lib/intrinsics/src/memory.jl +++ b/lib/intrinsics/src/memory.jl @@ -5,7 +5,7 @@ Context() do ctx # XXX: as long as LLVMPtr is emitted as i8*, it doesn't make sense to type the GV eltyp = convert(LLVMType, LLVM.Int8Type()) - T_ptr = convert(LLVMType, LLVMPtr{T,AS.Local}) + T_ptr = convert(LLVMType, LLVMPtr{T,AS.Workgroup}) # create a function llvm_f, _ = create_function(T_ptr) @@ -13,7 +13,7 @@ # create the global variable mod = LLVM.parent(llvm_f) gv_typ = LLVM.ArrayType(eltyp, len * sizeof(T)) - gv = GlobalVariable(mod, gv_typ, "local_memory", AS.Local) + gv = GlobalVariable(mod, gv_typ, "local_memory", AS.Workgroup) if len > 0 linkage!(gv, LLVM.API.LLVMInternalLinkage) initializer!(gv, null(gv_typ)) @@ -33,6 +33,6 @@ ret!(builder, untyped_ptr) end - call_function(llvm_f, LLVMPtr{T,AS.Local}) + call_function(llvm_f, LLVMPtr{T,AS.Workgroup}) end end diff --git a/lib/intrinsics/src/pointer.jl b/lib/intrinsics/src/pointer.jl index 228740e5..b84d2d15 100644 --- a/lib/intrinsics/src/pointer.jl +++ b/lib/intrinsics/src/pointer.jl @@ -4,13 +4,16 @@ export AS module AS -const Private = 0 -const Global = 1 -const Constant = 2 -const Local = 3 -const Generic = 4 -const Input = 5 -const Output = 6 -const Count = 7 +const Function = 0 +const CrossWorkgroup = 1 +const UniformConstant = 2 +const Workgroup = 3 +const Generic = 4 +const DeviceOnlyINTEL = 5 # XXX: should be CrossWorkgroup +const HostOnlyINTEL = 6 # when USM is not supported +const Input = 7 +const Output = 8 +const CodeSectionINTEL = 9 +const Private = 10 end diff --git a/lib/intrinsics/src/printf.jl b/lib/intrinsics/src/printf.jl index 9f04e13e..203a966d 100644 --- a/lib/intrinsics/src/printf.jl +++ b/lib/intrinsics/src/printf.jl @@ -31,7 +31,7 @@ end Context() do ctx T_void = LLVM.VoidType() T_int32 = LLVM.Int32Type() - T_pint8 = LLVM.PointerType(LLVM.Int8Type(), AS.Constant) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(), AS.UniformConstant) # create functions param_types = LLVMType[convert(LLVMType, typ) for typ in arg_types] @@ -80,7 +80,7 @@ end push!(actual_args, actual_arg) end - str = globalstring_ptr!(builder, String(fmt); addrspace=AS.Constant) + str = globalstring_ptr!(builder, String(fmt); addrspace=AS.UniformConstant) # invoke printf and return printf_typ = LLVM.FunctionType(T_int32, [T_pint8]; vararg=true) diff --git a/lib/intrinsics/src/utils.jl b/lib/intrinsics/src/utils.jl index f62a21be..eae013c3 100644 --- a/lib/intrinsics/src/utils.jl +++ b/lib/intrinsics/src/utils.jl @@ -34,22 +34,22 @@ macro builtin_ccall(name, ret, argtypes, args...) elt, as = T.parameters # mangle address space - ASstr = if as == AS.Global + ASstr = if as == AS.CrossWorkgroup "CLglobal" #elseif as == AS.Global_device # "CLdevice" #elseif as == AS.Global_host # "CLhost" - elseif as == AS.Local + elseif as == AS.Workgroup "CLlocal" - elseif as == AS.Constant + elseif as == AS.UniformConstant "CLconstant" - elseif as == AS.Private + elseif as == AS.Function "CLprivate" elseif as == AS.Generic "CLgeneric" else - error("Unknown address space $AS") + error("Unknown address space $as") end # encode as vendor qualifier diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl index 1a059fe4..bbe85adb 100644 --- a/lib/intrinsics/src/work_item.jl +++ b/lib/intrinsics/src/work_item.jl @@ -1,28 +1,64 @@ # Work-Item Functions - -export get_work_dim, - get_global_size, get_global_id, - get_local_size, get_enqueued_local_size, get_local_id, - get_num_groups, get_group_id, - get_global_offset, - get_global_linear_id, get_local_linear_id +# +# https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Env.html#_built_in_variables # NOTE: these functions now unsafely truncate to Int to avoid top bit checks. # we should probably use range metadata instead. -@device_function get_work_dim() = @builtin_ccall("get_work_dim", UInt32, ()) % Int - -@device_function get_global_size(dimindx::Integer = 1u32) = @builtin_ccall("get_global_size", UInt, (UInt32,), dimindx - 1u32) % Int -@device_function get_global_id(dimindx::Integer = 1u32) = @builtin_ccall("get_global_id", UInt, (UInt32,), dimindx - 1u32) % Int + 1 - -@device_function get_local_size(dimindx::Integer = 1u32) = @builtin_ccall("get_local_size", UInt, (UInt32,), dimindx - 1u32) % Int -@device_function get_enqueued_local_size(dimindx::Integer = 1u32) = @builtin_ccall("get_enqueued_local_size", UInt, (UInt32,), dimindx - 1u32) % Int -@device_function get_local_id(dimindx::Integer = 1u32) = @builtin_ccall("get_local_id", UInt, (UInt32,), dimindx - 1u32) % Int + 1 - -@device_function get_num_groups(dimindx::Integer = 1u32) = @builtin_ccall("get_num_groups", UInt, (UInt32,), dimindx - 1u32) % Int -@device_function get_group_id(dimindx::Integer = 1u32) = @builtin_ccall("get_group_id", UInt, (UInt32,), dimindx - 1u32) % Int + 1 - -@device_function get_global_offset(dimindx::Integer = 1u32) = @builtin_ccall("get_global_offset", UInt, (UInt32,), dimindx - 1u32) % Int + 1 +# 1D values +for (julia_name, (spirv_name, julia_type, offset)) in [ + # indices + :get_global_linear_id => (:BuiltInGlobalLinearId, Csize_t, 1), + :get_local_linear_id => (:BuiltInLocalInvocationIndex, Csize_t, 1), + :get_sub_group_id => (:BuiltInSubgroupId, UInt32, 1), + :get_sub_group_local_id => (:BuiltInSubgroupLocalInvocationId, UInt32, 1), + # sizes + :get_work_dim => (:BuiltInWorkDim, UInt32, 0), + :get_sub_group_size => (:BuiltInSubgroupSize, UInt32, 0), + :get_max_sub_group_size => (:BuiltInSubgroupMaxSize, UInt32, 0), + :get_num_sub_groups => (:BuiltInNumSubgroups, UInt32, 0), + :get_enqueued_num_sub_groups => (:BuiltInNumEnqueuedSubgroups, UInt32, 0)] + gvar_name = Symbol("@__spirv_$(spirv_name)") + width = sizeof(julia_type) * 8 + @eval begin + export $julia_name + @device_function $julia_name() = + Base.llvmcall( + $("""$gvar_name = external addrspace($(AS.Input)) global i$(width) + define i$(width) @entry() #0 { + %val = load i$(width), i$(width) addrspace($(AS.Input))* $gvar_name + ret i$(width) %val + } + attributes #0 = { alwaysinline } + """, "entry"), $julia_type, Tuple{}) % Int + $offset + end +end -@device_function get_global_linear_id() = @builtin_ccall("get_global_linear_id", UInt, ()) % Int + 1 -@device_function get_local_linear_id() = @builtin_ccall("get_local_linear_id", UInt, ()) % Int + 1 +# 3D values +for (julia_name, (spirv_name, offset)) in [ + # indices + :get_global_id => (:BuiltInGlobalInvocationId, 1), + :get_global_offset => (:BuiltInGlobalOffset, 1), + :get_local_id => (:BuiltInLocalInvocationId, 1), + :get_group_id => (:BuiltInWorkgroupId, 1), + # sizes + :get_global_size => (:BuiltInGlobalSize, 0), + :get_local_size => (:BuiltInWorkgroupSize, 0), + :get_enqueued_local_size => (:BuiltInEnqueuedWorkgroupSize, 0), + :get_num_groups => (:BuiltInNumWorkgroups, 0)] + gvar_name = Symbol("@__spirv_$(spirv_name)") + width = Int === Int64 ? 64 : 32 + @eval begin + export $julia_name + @device_function $julia_name(dimindx::Integer=1u32) = + Base.llvmcall( + $("""$gvar_name = external addrspace($(AS.Input)) global <3 x i$(width)> + define i$(width) @entry(i$(width) %idx) #0 { + %val = load <3 x i$(width)>, <3 x i$(width)> addrspace($(AS.Input))* $gvar_name + %element = extractelement <3 x i$(width)> %val, i$(width) %idx + ret i$(width) %element + } + attributes #0 = { alwaysinline } + """, "entry"), UInt, Tuple{UInt}, UInt(dimindx - 1u32)) % Int + $offset + end +end diff --git a/src/OpenCL.jl b/src/OpenCL.jl index 3e05693b..23cc0abb 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -2,7 +2,7 @@ module OpenCL using GPUCompiler using LLVM, LLVM.Interop -using SPIRV_LLVM_Translator_jll +using SPIRV_LLVM_Backend_jll, SPIRV_Tools_jll using Adapt using Reexport using GPUArrays diff --git a/src/array.jl b/src/array.jl index 9f6d496d..fa8d1572 100644 --- a/src/array.jl +++ b/src/array.jl @@ -299,9 +299,10 @@ end ## interop with GPU arrays -function Base.unsafe_convert(::Type{CLDeviceArray{T, N, AS.Global}}, a::CLArray{T, N}) where {T, N} - return CLDeviceArray{T, N, AS.Global}( - size(a), reinterpret(LLVMPtr{T, AS.Global}, pointer(a)), +function Base.unsafe_convert(::Type{CLDeviceArray{T, N, AS.CrossWorkgroup}}, + a::CLArray{T, N}) where {T, N} + return CLDeviceArray{T, N, AS.CrossWorkgroup}( + size(a), reinterpret(LLVMPtr{T, AS.CrossWorkgroup}, pointer(a)), a.maxsize - a.offset * Base.elsize(a) ) end diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index c95bae67..49c50e04 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -47,7 +47,7 @@ end supports_fp64 = "cl_khr_fp64" in dev.extensions # create GPUCompiler objects - target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) + target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...) params = OpenCLCompilerParams() CompilerConfig(target, params; kernel, name, always_inline) end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 946f46e3..881ea906 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -92,7 +92,7 @@ function Adapt.adapt_storage(to::KernelAdaptor, buf::cl.AbstractMemory) end function Adapt.adapt_storage(to::KernelAdaptor, arr::CLArray{T, N}) where {T, N} push!(to.indirect_memory, arr.data[].mem) - return Base.unsafe_convert(CLDeviceArray{T, N, AS.Global}, arr) + return Base.unsafe_convert(CLDeviceArray{T, N, AS.CrossWorkgroup}, arr) end # Base.RefValue isn't GPU compatible, so provide a compatible alternative diff --git a/src/util.jl b/src/util.jl index 28a22ff2..85baf052 100644 --- a/src/util.jl +++ b/src/util.jl @@ -38,7 +38,7 @@ function versioninfo(io::IO=stdout) println(io, "Toolchain:") println(io, " - Julia v$(VERSION)") - for jll in [cl.OpenCL_jll, SPIRV_LLVM_Translator_jll] + for jll in [cl.OpenCL_jll, SPIRV_LLVM_Backend_jll] name = string(jll) println(io, " - $(name[1:end-4]): $(pkgversion(jll))") end diff --git a/test/Project.toml b/test/Project.toml index fa38cc3a..0d5bc3d1 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -7,10 +7,10 @@ IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/test/execution.jl b/test/execution.jl index ce7e0106..0936d0df 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -93,7 +93,7 @@ end @test OpenCL.return_type(identity, Tuple{Int}) === Int @test OpenCL.return_type(sin, Tuple{Float32}) === Float32 - @test OpenCL.return_type(getindex, Tuple{CLDeviceArray{Float32,1,AS.Global},Int32}) === Float32 + @test OpenCL.return_type(getindex, Tuple{CLDeviceArray{Float32,1,AS.CrossWorkgroup},Int32}) === Float32 @test OpenCL.return_type(getindex, Tuple{Base.RefValue{Integer}}) === Integer end