From 4efbb17ebe18cf2ff75018cf9adc0171de84cf72 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 13 Feb 2025 13:33:27 +0100 Subject: [PATCH 01/11] Switch to the LLVM SPIR-V back-end. --- Project.toml | 8 +++++--- src/OpenCL.jl | 2 +- test/Project.toml | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index d6a2f870..5eeb86fb 100644 --- a/Project.toml +++ b/Project.toml @@ -14,13 +14,14 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" -SPIRV_LLVM_Translator_jll = "4a5d46fc-d8cf-5151-a261-86b458210efb" +SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c" +SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] Adapt = "4" GPUArrays = "11.2.1" -GPUCompiler = "0.27, 1" +GPUCompiler = "1.2" KernelAbstractions = "0.9.2" LLVM = "9.1" LinearAlgebra = "1" @@ -29,6 +30,7 @@ Printf = "1" Random = "1" Reexport = "1" SPIRVIntrinsics = "0.2" -SPIRV_LLVM_Translator_jll = "20" +SPIRV_LLVM_Backend_jll = "20" +SPIRV_Tools_jll = "2024.3.0" StaticArrays = "1" julia = "1.10" diff --git a/src/OpenCL.jl b/src/OpenCL.jl index 3e05693b..23cc0abb 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -2,7 +2,7 @@ module OpenCL using GPUCompiler using LLVM, LLVM.Interop -using SPIRV_LLVM_Translator_jll +using SPIRV_LLVM_Backend_jll, SPIRV_Tools_jll using Adapt using Reexport using GPUArrays diff --git a/test/Project.toml b/test/Project.toml index fa38cc3a..2c0a9a9d 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -11,6 +11,7 @@ OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SPIRV_LLVM_Backend_jll = "4376b9bf-cff8-51b6-bb48-39421dff0d0c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" From 837e0b3f94ddc84b26189841afa4242b89ee60e2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 13 Feb 2025 13:33:37 +0100 Subject: [PATCH 02/11] Port work item intrinsics. --- lib/intrinsics/src/work_item.jl | 78 +++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl index 1a059fe4..d4e66cb0 100644 --- a/lib/intrinsics/src/work_item.jl +++ b/lib/intrinsics/src/work_item.jl @@ -1,28 +1,60 @@ # Work-Item Functions -export get_work_dim, - get_global_size, get_global_id, - get_local_size, get_enqueued_local_size, get_local_id, - get_num_groups, get_group_id, - get_global_offset, - get_global_linear_id, get_local_linear_id - # NOTE: these functions now unsafely truncate to Int to avoid top bit checks. # we should probably use range metadata instead. -@device_function get_work_dim() = @builtin_ccall("get_work_dim", UInt32, ()) % Int - -@device_function get_global_size(dimindx::Integer = 1u32) = @builtin_ccall("get_global_size", UInt, (UInt32,), dimindx - 1u32) % Int -@device_function get_global_id(dimindx::Integer = 1u32) = @builtin_ccall("get_global_id", UInt, (UInt32,), dimindx - 1u32) % Int + 1 - -@device_function get_local_size(dimindx::Integer = 1u32) = @builtin_ccall("get_local_size", UInt, (UInt32,), dimindx - 1u32) % Int -@device_function get_enqueued_local_size(dimindx::Integer = 1u32) = @builtin_ccall("get_enqueued_local_size", UInt, (UInt32,), dimindx - 1u32) % Int -@device_function get_local_id(dimindx::Integer = 1u32) = @builtin_ccall("get_local_id", UInt, (UInt32,), dimindx - 1u32) % Int + 1 - -@device_function get_num_groups(dimindx::Integer = 1u32) = @builtin_ccall("get_num_groups", UInt, (UInt32,), dimindx - 1u32) % Int -@device_function get_group_id(dimindx::Integer = 1u32) = @builtin_ccall("get_group_id", UInt, (UInt32,), dimindx - 1u32) % Int + 1 - -@device_function get_global_offset(dimindx::Integer = 1u32) = @builtin_ccall("get_global_offset", UInt, (UInt32,), dimindx - 1u32) % Int + 1 - -@device_function get_global_linear_id() = @builtin_ccall("get_global_linear_id", UInt, ()) % Int + 1 -@device_function get_local_linear_id() = @builtin_ccall("get_local_linear_id", UInt, ()) % Int + 1 +# 1D values +for (julia_name, (spirv_name, offset)) in [ + # indices + :get_global_linear_id => (:BuiltInGlobalLinearId, 1u32), + :get_local_linear_id => (:BuiltInLocalInvocationIndex, 1u32), + :get_sub_group_id => (:BuiltInSubgroupId, 1u32), + :get_sub_group_local_id => (:BuiltInSubgroupLocalInvocationId, 1u32), + # sizes + :get_work_dim => (:BuiltInWorkDim, 0u32), + :get_sub_group_size => (:BuiltInSubgroupSize, 0u32), + :get_max_sub_group_size => (:BuiltInSubgroupMaxSize, 0u32), + :get_num_sub_groups => (:BuiltInNumSubgroups, 0u32), + :get_enqueued_num_sub_groups => (:BuiltInNumEnqueuedSubgroups, 0u32)] + gvar_name = Symbol("@__spirv_$(spirv_name)") + @eval begin + export $julia_name + @device_function $julia_name() = + Base.llvmcall( + $("""$gvar_name = external addrspace(1) global i32 + define i32 @entry() #0 { + %val = load i32, i32 addrspace(1)* $gvar_name + ret i32 %val + } + attributes #0 = { alwaysinline } + """, "entry"), UInt32, Tuple{}) % Int + $offset + end +end + +# 3D values +for (julia_name, (spirv_name, offset)) in [ + # indices + :get_global_id => (:BuiltInGlobalInvocationId, 1u32), + :get_global_offset => (:BuiltInGlobalOffset, 1u32), + :get_local_id => (:BuiltInLocalInvocationId, 1u32), + :get_group_id => (:BuiltInWorkgroupId, 1u32), + # sizes + :get_global_size => (:BuiltInGlobalSize, 0u32), + :get_local_size => (:BuiltInWorkgroupSize, 0u32), + :get_enqueued_local_size => (:BuiltInEnqueuedWorkgroupSize, 0u32), + :get_num_groups => (:BuiltInNumWorkgroups, 0u32)] + gvar_name = Symbol("@__spirv_$(spirv_name)") + @eval begin + export $julia_name + @device_function $julia_name(dimindx::Integer=1) = + Base.llvmcall( + $("""$gvar_name = external addrspace(1) global <3 x i32> + define i32 @entry(i32 %idx) #0 { + %val = load <3 x i32>, <3 x i32> addrspace(1)* $gvar_name + %element = extractelement <3 x i32> %val, i32 %idx + ret i32 %element + } + attributes #0 = { alwaysinline } + """, "entry"), UInt32, Tuple{UInt32}, UInt32(dimindx - 1u32)) % Int + $offset + end +end From 78763827114d49c1e90821b90eb3a9bd18f9414e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 13 Feb 2025 14:19:06 +0100 Subject: [PATCH 03/11] Switch to SPIR-V address spaces. --- lib/intrinsics/src/atomic.jl | 2 +- lib/intrinsics/src/math.jl | 4 ++-- lib/intrinsics/src/memory.jl | 6 ++--- lib/intrinsics/src/pointer.jl | 19 ++++++++------- lib/intrinsics/src/printf.jl | 42 +++++++++++++++++++++++++++++++-- lib/intrinsics/src/utils.jl | 10 ++++---- lib/intrinsics/src/work_item.jl | 8 +++---- src/array.jl | 7 +++--- src/compiler/execution.jl | 2 +- test/execution.jl | 2 +- 10 files changed, 72 insertions(+), 30 deletions(-) diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl index 144026fe..a1c6007d 100644 --- a/lib/intrinsics/src/atomic.jl +++ b/lib/intrinsics/src/atomic.jl @@ -8,7 +8,7 @@ const atomic_integer_types = [UInt32, Int32] # TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS # TODO: additional floating-point atomics with ZE_extension_float_atomics -const atomic_memory_types = [AS.Local, AS.Global] +const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup] # generically typed diff --git a/lib/intrinsics/src/math.jl b/lib/intrinsics/src/math.jl index 1e4c2a97..d51d603b 100644 --- a/lib/intrinsics/src/math.jl +++ b/lib/intrinsics/src/math.jl @@ -94,8 +94,8 @@ for gentype in generic_types cosval = Ref{$gentype}() sinval = GC.@preserve cosval begin ptr = Base.unsafe_convert(Ptr{$gentype}, cosval) - llvm_ptr = reinterpret(LLVMPtr{$gentype, AS.Private}, ptr) - @builtin_ccall("sincos", $gentype, ($gentype, LLVMPtr{$gentype, AS.Private}), x, llvm_ptr) + llvm_ptr = reinterpret(LLVMPtr{$gentype, AS.Function}, ptr) + @builtin_ccall("sincos", $gentype, ($gentype, LLVMPtr{$gentype, AS.Function}), x, llvm_ptr) end return sinval, cosval[] end diff --git a/lib/intrinsics/src/memory.jl b/lib/intrinsics/src/memory.jl index d5ffe260..da5c5b2b 100644 --- a/lib/intrinsics/src/memory.jl +++ b/lib/intrinsics/src/memory.jl @@ -5,7 +5,7 @@ Context() do ctx # XXX: as long as LLVMPtr is emitted as i8*, it doesn't make sense to type the GV eltyp = convert(LLVMType, LLVM.Int8Type()) - T_ptr = convert(LLVMType, LLVMPtr{T,AS.Local}) + T_ptr = convert(LLVMType, LLVMPtr{T,AS.Workgroup}) # create a function llvm_f, _ = create_function(T_ptr) @@ -13,7 +13,7 @@ # create the global variable mod = LLVM.parent(llvm_f) gv_typ = LLVM.ArrayType(eltyp, len * sizeof(T)) - gv = GlobalVariable(mod, gv_typ, "local_memory", AS.Local) + gv = GlobalVariable(mod, gv_typ, "local_memory", AS.Workgroup) if len > 0 linkage!(gv, LLVM.API.LLVMInternalLinkage) initializer!(gv, null(gv_typ)) @@ -33,6 +33,6 @@ ret!(builder, untyped_ptr) end - call_function(llvm_f, LLVMPtr{T,AS.Local}) + call_function(llvm_f, LLVMPtr{T,AS.Workgroup}) end end diff --git a/lib/intrinsics/src/pointer.jl b/lib/intrinsics/src/pointer.jl index 228740e5..b84d2d15 100644 --- a/lib/intrinsics/src/pointer.jl +++ b/lib/intrinsics/src/pointer.jl @@ -4,13 +4,16 @@ export AS module AS -const Private = 0 -const Global = 1 -const Constant = 2 -const Local = 3 -const Generic = 4 -const Input = 5 -const Output = 6 -const Count = 7 +const Function = 0 +const CrossWorkgroup = 1 +const UniformConstant = 2 +const Workgroup = 3 +const Generic = 4 +const DeviceOnlyINTEL = 5 # XXX: should be CrossWorkgroup +const HostOnlyINTEL = 6 # when USM is not supported +const Input = 7 +const Output = 8 +const CodeSectionINTEL = 9 +const Private = 10 end diff --git a/lib/intrinsics/src/printf.jl b/lib/intrinsics/src/printf.jl index 9f04e13e..8de4f004 100644 --- a/lib/intrinsics/src/printf.jl +++ b/lib/intrinsics/src/printf.jl @@ -31,7 +31,7 @@ end Context() do ctx T_void = LLVM.VoidType() T_int32 = LLVM.Int32Type() - T_pint8 = LLVM.PointerType(LLVM.Int8Type(), AS.Constant) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(), AS.UniformConstant) # create functions param_types = LLVMType[convert(LLVMType, typ) for typ in arg_types] @@ -80,7 +80,45 @@ end push!(actual_args, actual_arg) end - str = globalstring_ptr!(builder, String(fmt); addrspace=AS.Constant) + # `printf` needs to be invoked very specifically, e.g., the format string needs + # to be a pointer to a string, and arguments need to match exactly what is + # expected by the format string, so we cannot rely on how the arguments to this + # function have been passed in (by `llvmcall`). + T_actual_args = LLVMType[] + actual_args = LLVM.Value[] + for (_, (arg, argtyp)) in enumerate(zip(parameters(llvm_f), arg_types)) + if argtyp <: LLVMPtr + # passed as i8* + T,AS = argtyp.parameters + actual_typ = LLVM.PointerType(convert(LLVMType, T), AS) + actual_arg = bitcast!(builder, arg, actual_typ) + elseif argtyp <: Ptr + T = eltype(argtyp) + if T === Nothing + T = Int8 + end + actual_typ = LLVM.PointerType(convert(LLVMType, T)) + actual_arg = if value_type(arg) isa LLVM.PointerType + # passed as i8* or ptr + bitcast!(builder, arg, actual_typ) + else + # passed as i64 + inttoptr!(builder, arg, actual_typ) + end + elseif argtyp <: Bool + # passed as i8 + T = eltype(argtyp) + actual_typ = LLVM.Int1Type() + actual_arg = trunc!(builder, arg, actual_typ) + else + actual_typ = convert(LLVMType, argtyp) + actual_arg = arg + end + push!(T_actual_args, actual_typ) + push!(actual_args, actual_arg) + end + + str = globalstring_ptr!(builder, String(fmt); addrspace=AS.UniformConstant) # invoke printf and return printf_typ = LLVM.FunctionType(T_int32, [T_pint8]; vararg=true) diff --git a/lib/intrinsics/src/utils.jl b/lib/intrinsics/src/utils.jl index f62a21be..eae013c3 100644 --- a/lib/intrinsics/src/utils.jl +++ b/lib/intrinsics/src/utils.jl @@ -34,22 +34,22 @@ macro builtin_ccall(name, ret, argtypes, args...) elt, as = T.parameters # mangle address space - ASstr = if as == AS.Global + ASstr = if as == AS.CrossWorkgroup "CLglobal" #elseif as == AS.Global_device # "CLdevice" #elseif as == AS.Global_host # "CLhost" - elseif as == AS.Local + elseif as == AS.Workgroup "CLlocal" - elseif as == AS.Constant + elseif as == AS.UniformConstant "CLconstant" - elseif as == AS.Private + elseif as == AS.Function "CLprivate" elseif as == AS.Generic "CLgeneric" else - error("Unknown address space $AS") + error("Unknown address space $as") end # encode as vendor qualifier diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl index d4e66cb0..cf093b08 100644 --- a/lib/intrinsics/src/work_item.jl +++ b/lib/intrinsics/src/work_item.jl @@ -21,9 +21,9 @@ for (julia_name, (spirv_name, offset)) in [ export $julia_name @device_function $julia_name() = Base.llvmcall( - $("""$gvar_name = external addrspace(1) global i32 + $("""$gvar_name = external addrspace($(AS.Input)) global i32 define i32 @entry() #0 { - %val = load i32, i32 addrspace(1)* $gvar_name + %val = load i32, i32 addrspace($(AS.Input))* $gvar_name ret i32 %val } attributes #0 = { alwaysinline } @@ -48,9 +48,9 @@ for (julia_name, (spirv_name, offset)) in [ export $julia_name @device_function $julia_name(dimindx::Integer=1) = Base.llvmcall( - $("""$gvar_name = external addrspace(1) global <3 x i32> + $("""$gvar_name = external addrspace($(AS.Input)) global <3 x i32> define i32 @entry(i32 %idx) #0 { - %val = load <3 x i32>, <3 x i32> addrspace(1)* $gvar_name + %val = load <3 x i32>, <3 x i32> addrspace($(AS.Input))* $gvar_name %element = extractelement <3 x i32> %val, i32 %idx ret i32 %element } diff --git a/src/array.jl b/src/array.jl index 9f6d496d..fa8d1572 100644 --- a/src/array.jl +++ b/src/array.jl @@ -299,9 +299,10 @@ end ## interop with GPU arrays -function Base.unsafe_convert(::Type{CLDeviceArray{T, N, AS.Global}}, a::CLArray{T, N}) where {T, N} - return CLDeviceArray{T, N, AS.Global}( - size(a), reinterpret(LLVMPtr{T, AS.Global}, pointer(a)), +function Base.unsafe_convert(::Type{CLDeviceArray{T, N, AS.CrossWorkgroup}}, + a::CLArray{T, N}) where {T, N} + return CLDeviceArray{T, N, AS.CrossWorkgroup}( + size(a), reinterpret(LLVMPtr{T, AS.CrossWorkgroup}, pointer(a)), a.maxsize - a.offset * Base.elsize(a) ) end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 946f46e3..881ea906 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -92,7 +92,7 @@ function Adapt.adapt_storage(to::KernelAdaptor, buf::cl.AbstractMemory) end function Adapt.adapt_storage(to::KernelAdaptor, arr::CLArray{T, N}) where {T, N} push!(to.indirect_memory, arr.data[].mem) - return Base.unsafe_convert(CLDeviceArray{T, N, AS.Global}, arr) + return Base.unsafe_convert(CLDeviceArray{T, N, AS.CrossWorkgroup}, arr) end # Base.RefValue isn't GPU compatible, so provide a compatible alternative diff --git a/test/execution.jl b/test/execution.jl index ce7e0106..0936d0df 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -93,7 +93,7 @@ end @test OpenCL.return_type(identity, Tuple{Int}) === Int @test OpenCL.return_type(sin, Tuple{Float32}) === Float32 - @test OpenCL.return_type(getindex, Tuple{CLDeviceArray{Float32,1,AS.Global},Int32}) === Float32 + @test OpenCL.return_type(getindex, Tuple{CLDeviceArray{Float32,1,AS.CrossWorkgroup},Int32}) === Float32 @test OpenCL.return_type(getindex, Tuple{Base.RefValue{Integer}}) === Integer end From 507d5296c81905791bfb2683cb7221af19326550 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 13 Feb 2025 14:54:24 +0100 Subject: [PATCH 04/11] Switch work-item intrinsics to native integer width. --- lib/intrinsics/src/work_item.jl | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl index cf093b08..ee94a7d3 100644 --- a/lib/intrinsics/src/work_item.jl +++ b/lib/intrinsics/src/work_item.jl @@ -17,17 +17,18 @@ for (julia_name, (spirv_name, offset)) in [ :get_num_sub_groups => (:BuiltInNumSubgroups, 0u32), :get_enqueued_num_sub_groups => (:BuiltInNumEnqueuedSubgroups, 0u32)] gvar_name = Symbol("@__spirv_$(spirv_name)") + width = Int === Int64 ? 64 : 32 @eval begin export $julia_name @device_function $julia_name() = Base.llvmcall( - $("""$gvar_name = external addrspace($(AS.Input)) global i32 - define i32 @entry() #0 { - %val = load i32, i32 addrspace($(AS.Input))* $gvar_name - ret i32 %val + $("""$gvar_name = external addrspace($(AS.Input)) global i$(width) + define i$(width) @entry() #0 { + %val = load i$(width), i$(width) addrspace($(AS.Input))* $gvar_name + ret i$(width) %val } attributes #0 = { alwaysinline } - """, "entry"), UInt32, Tuple{}) % Int + $offset + """, "entry"), UInt, Tuple{}) % Int + $offset end end @@ -44,17 +45,18 @@ for (julia_name, (spirv_name, offset)) in [ :get_enqueued_local_size => (:BuiltInEnqueuedWorkgroupSize, 0u32), :get_num_groups => (:BuiltInNumWorkgroups, 0u32)] gvar_name = Symbol("@__spirv_$(spirv_name)") + width = Int === Int64 ? 64 : 32 @eval begin export $julia_name @device_function $julia_name(dimindx::Integer=1) = Base.llvmcall( - $("""$gvar_name = external addrspace($(AS.Input)) global <3 x i32> - define i32 @entry(i32 %idx) #0 { - %val = load <3 x i32>, <3 x i32> addrspace($(AS.Input))* $gvar_name - %element = extractelement <3 x i32> %val, i32 %idx - ret i32 %element + $("""$gvar_name = external addrspace($(AS.Input)) global <3 x i$(width)> + define i$(width) @entry(i$(width) %idx) #0 { + %val = load <3 x i$(width)>, <3 x i$(width)> addrspace($(AS.Input))* $gvar_name + %element = extractelement <3 x i$(width)> %val, i$(width) %idx + ret i$(width) %element } attributes #0 = { alwaysinline } - """, "entry"), UInt32, Tuple{UInt32}, UInt32(dimindx - 1u32)) % Int + $offset + """, "entry"), UInt64, Tuple{UInt64}, UInt64(dimindx - 1u32)) % Int + $offset end end From 3918d8a9230605f7d69c238887e2f42338fa581d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 13 Feb 2025 15:08:50 +0100 Subject: [PATCH 05/11] Remove unneeded test dep. --- test/Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/test/Project.toml b/test/Project.toml index 2c0a9a9d..0d5bc3d1 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -7,7 +7,6 @@ IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" From b69b2c530ba0e9c441bda742be763a73a4f9856a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 13 Feb 2025 15:17:38 +0100 Subject: [PATCH 06/11] Fix definition. --- lib/intrinsics/src/work_item.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl index ee94a7d3..8168ef33 100644 --- a/lib/intrinsics/src/work_item.jl +++ b/lib/intrinsics/src/work_item.jl @@ -57,6 +57,6 @@ for (julia_name, (spirv_name, offset)) in [ ret i$(width) %element } attributes #0 = { alwaysinline } - """, "entry"), UInt64, Tuple{UInt64}, UInt64(dimindx - 1u32)) % Int + $offset + """, "entry"), UInt, Tuple{UInt}, UInt(dimindx - 1u32)) % Int + $offset end end From 0e6ed99a750b30b474da1fbd2c66fbe82df24192 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 14 Feb 2025 07:15:12 +0100 Subject: [PATCH 07/11] Conform more strictly to the SPIR-V/OpenCL spec. --- lib/intrinsics/src/work_item.jl | 44 +++++++++++++++++---------------- src/compiler/compilation.jl | 2 +- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/lib/intrinsics/src/work_item.jl b/lib/intrinsics/src/work_item.jl index 8168ef33..bbe85adb 100644 --- a/lib/intrinsics/src/work_item.jl +++ b/lib/intrinsics/src/work_item.jl @@ -1,23 +1,25 @@ # Work-Item Functions +# +# https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Env.html#_built_in_variables # NOTE: these functions now unsafely truncate to Int to avoid top bit checks. # we should probably use range metadata instead. # 1D values -for (julia_name, (spirv_name, offset)) in [ +for (julia_name, (spirv_name, julia_type, offset)) in [ # indices - :get_global_linear_id => (:BuiltInGlobalLinearId, 1u32), - :get_local_linear_id => (:BuiltInLocalInvocationIndex, 1u32), - :get_sub_group_id => (:BuiltInSubgroupId, 1u32), - :get_sub_group_local_id => (:BuiltInSubgroupLocalInvocationId, 1u32), + :get_global_linear_id => (:BuiltInGlobalLinearId, Csize_t, 1), + :get_local_linear_id => (:BuiltInLocalInvocationIndex, Csize_t, 1), + :get_sub_group_id => (:BuiltInSubgroupId, UInt32, 1), + :get_sub_group_local_id => (:BuiltInSubgroupLocalInvocationId, UInt32, 1), # sizes - :get_work_dim => (:BuiltInWorkDim, 0u32), - :get_sub_group_size => (:BuiltInSubgroupSize, 0u32), - :get_max_sub_group_size => (:BuiltInSubgroupMaxSize, 0u32), - :get_num_sub_groups => (:BuiltInNumSubgroups, 0u32), - :get_enqueued_num_sub_groups => (:BuiltInNumEnqueuedSubgroups, 0u32)] + :get_work_dim => (:BuiltInWorkDim, UInt32, 0), + :get_sub_group_size => (:BuiltInSubgroupSize, UInt32, 0), + :get_max_sub_group_size => (:BuiltInSubgroupMaxSize, UInt32, 0), + :get_num_sub_groups => (:BuiltInNumSubgroups, UInt32, 0), + :get_enqueued_num_sub_groups => (:BuiltInNumEnqueuedSubgroups, UInt32, 0)] gvar_name = Symbol("@__spirv_$(spirv_name)") - width = Int === Int64 ? 64 : 32 + width = sizeof(julia_type) * 8 @eval begin export $julia_name @device_function $julia_name() = @@ -28,27 +30,27 @@ for (julia_name, (spirv_name, offset)) in [ ret i$(width) %val } attributes #0 = { alwaysinline } - """, "entry"), UInt, Tuple{}) % Int + $offset + """, "entry"), $julia_type, Tuple{}) % Int + $offset end end # 3D values for (julia_name, (spirv_name, offset)) in [ # indices - :get_global_id => (:BuiltInGlobalInvocationId, 1u32), - :get_global_offset => (:BuiltInGlobalOffset, 1u32), - :get_local_id => (:BuiltInLocalInvocationId, 1u32), - :get_group_id => (:BuiltInWorkgroupId, 1u32), + :get_global_id => (:BuiltInGlobalInvocationId, 1), + :get_global_offset => (:BuiltInGlobalOffset, 1), + :get_local_id => (:BuiltInLocalInvocationId, 1), + :get_group_id => (:BuiltInWorkgroupId, 1), # sizes - :get_global_size => (:BuiltInGlobalSize, 0u32), - :get_local_size => (:BuiltInWorkgroupSize, 0u32), - :get_enqueued_local_size => (:BuiltInEnqueuedWorkgroupSize, 0u32), - :get_num_groups => (:BuiltInNumWorkgroups, 0u32)] + :get_global_size => (:BuiltInGlobalSize, 0), + :get_local_size => (:BuiltInWorkgroupSize, 0), + :get_enqueued_local_size => (:BuiltInEnqueuedWorkgroupSize, 0), + :get_num_groups => (:BuiltInNumWorkgroups, 0)] gvar_name = Symbol("@__spirv_$(spirv_name)") width = Int === Int64 ? 64 : 32 @eval begin export $julia_name - @device_function $julia_name(dimindx::Integer=1) = + @device_function $julia_name(dimindx::Integer=1u32) = Base.llvmcall( $("""$gvar_name = external addrspace($(AS.Input)) global <3 x i$(width)> define i$(width) @entry(i$(width) %idx) #0 { diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index c95bae67..49c50e04 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -47,7 +47,7 @@ end supports_fp64 = "cl_khr_fp64" in dev.extensions # create GPUCompiler objects - target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) + target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true, kwargs...) params = OpenCLCompilerParams() CompilerConfig(target, params; kernel, name, always_inline) end From ff75a7b74f00cea1da03899cdf2f0b2f06c7ebee Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 14 Feb 2025 13:46:14 +0100 Subject: [PATCH 08/11] Bump SPIRV Tools for validation fixes. --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 5eeb86fb..b3c4672d 100644 --- a/Project.toml +++ b/Project.toml @@ -31,6 +31,6 @@ Random = "1" Reexport = "1" SPIRVIntrinsics = "0.2" SPIRV_LLVM_Backend_jll = "20" -SPIRV_Tools_jll = "2024.3.0" +SPIRV_Tools_jll = "2024.4" StaticArrays = "1" julia = "1.10" From 3990e9bc8d4e7db14838326077951c04184f3cbc Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 14 Feb 2025 13:55:15 +0100 Subject: [PATCH 09/11] Bump intrinsics version. --- Project.toml | 2 +- lib/intrinsics/Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index b3c4672d..566a41f7 100644 --- a/Project.toml +++ b/Project.toml @@ -29,7 +29,7 @@ OpenCL_jll = "=2024.10.24" Printf = "1" Random = "1" Reexport = "1" -SPIRVIntrinsics = "0.2" +SPIRVIntrinsics = "0.3" SPIRV_LLVM_Backend_jll = "20" SPIRV_Tools_jll = "2024.4" StaticArrays = "1" diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml index 75302449..262862f0 100644 --- a/lib/intrinsics/Project.toml +++ b/lib/intrinsics/Project.toml @@ -1,7 +1,7 @@ name = "SPIRVIntrinsics" uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" authors = ["Tim Besard "] -version = "0.2.1" +version = "0.3.0" [deps] ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" From c488e6e130c49462c4a430e9af81618dc68aca7d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 20 May 2025 09:37:05 +0200 Subject: [PATCH 10/11] Update utils. --- src/util.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.jl b/src/util.jl index 28a22ff2..85baf052 100644 --- a/src/util.jl +++ b/src/util.jl @@ -38,7 +38,7 @@ function versioninfo(io::IO=stdout) println(io, "Toolchain:") println(io, " - Julia v$(VERSION)") - for jll in [cl.OpenCL_jll, SPIRV_LLVM_Translator_jll] + for jll in [cl.OpenCL_jll, SPIRV_LLVM_Backend_jll] name = string(jll) println(io, " - $(name[1:end-4]): $(pkgversion(jll))") end From a85c23529ec18b5514fa5962e4321e2bcc092822 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 10 Jun 2025 13:39:16 +0200 Subject: [PATCH 11/11] Remove duplicate code. --- lib/intrinsics/src/printf.jl | 38 ------------------------------------ 1 file changed, 38 deletions(-) diff --git a/lib/intrinsics/src/printf.jl b/lib/intrinsics/src/printf.jl index 8de4f004..203a966d 100644 --- a/lib/intrinsics/src/printf.jl +++ b/lib/intrinsics/src/printf.jl @@ -80,44 +80,6 @@ end push!(actual_args, actual_arg) end - # `printf` needs to be invoked very specifically, e.g., the format string needs - # to be a pointer to a string, and arguments need to match exactly what is - # expected by the format string, so we cannot rely on how the arguments to this - # function have been passed in (by `llvmcall`). - T_actual_args = LLVMType[] - actual_args = LLVM.Value[] - for (_, (arg, argtyp)) in enumerate(zip(parameters(llvm_f), arg_types)) - if argtyp <: LLVMPtr - # passed as i8* - T,AS = argtyp.parameters - actual_typ = LLVM.PointerType(convert(LLVMType, T), AS) - actual_arg = bitcast!(builder, arg, actual_typ) - elseif argtyp <: Ptr - T = eltype(argtyp) - if T === Nothing - T = Int8 - end - actual_typ = LLVM.PointerType(convert(LLVMType, T)) - actual_arg = if value_type(arg) isa LLVM.PointerType - # passed as i8* or ptr - bitcast!(builder, arg, actual_typ) - else - # passed as i64 - inttoptr!(builder, arg, actual_typ) - end - elseif argtyp <: Bool - # passed as i8 - T = eltype(argtyp) - actual_typ = LLVM.Int1Type() - actual_arg = trunc!(builder, arg, actual_typ) - else - actual_typ = convert(LLVMType, argtyp) - actual_arg = arg - end - push!(T_actual_args, actual_typ) - push!(actual_args, actual_arg) - end - str = globalstring_ptr!(builder, String(fmt); addrspace=AS.UniformConstant) # invoke printf and return