Closed
Description
Using the current CUDA.jl master (146ad00c0), the following kernel works with Julia 1.7.2 but fails with Julia 1.8.0-beta1.
"""
    kernel_wmma_int8_lowlevel(a_dev, b_dev, c_dev, d_dev)

Device kernel that chains the low-level `WMMA.llvm_wmma_*` wrappers: load one
fragment each from `a_dev`, `b_dev` and `c_dev`, feed the three fragments to the
`m16n16k16` `s8` MMA wrapper, and store the returned fragment through
`pointer(d_dev)`. Always returns `nothing`.

Presumably this computes `D = A * B + C` for a single 16x16x16 tile, as the
wrapper names suggest -- confirm against the WMMA documentation.
"""
function kernel_wmma_int8_lowlevel(a_dev, b_dev, c_dev, d_dev)
    # Stride argument handed to every load/store wrapper below.
    stride = 16

    frag_a = WMMA.llvm_wmma_load_a_col_m16n16k16_global_stride_s8(pointer(a_dev), stride)
    frag_b = WMMA.llvm_wmma_load_b_col_m16n16k16_global_stride_s8(pointer(b_dev), stride)
    frag_c = WMMA.llvm_wmma_load_c_col_m16n16k16_global_stride_s32(pointer(c_dev), stride)

    # The MMA result gets its own name rather than overwriting the loaded C fragment.
    frag_d = WMMA.llvm_wmma_mma_col_col_m16n16k16_s8(frag_a, frag_b, frag_c)

    WMMA.llvm_wmma_store_d_col_m16n16k16_global_stride_s32(pointer(d_dev), frag_d, stride)
    return nothing
end
"""
    call_kernel()

Host-side driver for the reproducer: allocate 16x16 device matrices (`Int8`
inputs for A and B, `Int32` for C and D), launch `kernel_wmma_int8_lowlevel`
on them, and wait for completion via `CUDA.@sync`. Returns `nothing`.
"""
function call_kernel()
    m = n = k = 16
    dtype_a = dtype_b = Int8
    dtype_c = dtype_d = Int32

    d_a = CUDA.rand(dtype_a, m, k)
    d_b = CUDA.rand(dtype_b, k, n)
    d_c = CUDA.rand(dtype_c, m, n)
    d_d = CUDA.zeros(dtype_d, m, n)

    # NOTE(review): no `threads=` is given, so the launch uses the `@cuda` default.
    # WMMA operations are presumably warp-cooperative -- confirm whether
    # `threads=32` was intended. Left unchanged to keep the reproducer as reported.
    CUDA.@sync @cuda kernel_wmma_int8_lowlevel(d_a, d_b, d_c, d_d)
    return nothing
end
Error message for Julia 1.8.0-beta1:
julia> call_kernel() [0/2762]
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/stream.jl:139 [inlined]
[4] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/context.jl:325 [inlined]
[5] device_synchronize()
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/context.jl:319
[6] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/initialization.jl:54
caused by: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/stream.jl:139 [inlined]
[4] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/context.jl:325 [inlined]
[5] device_synchronize()
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/context.jl:319
[6] CuModule(data::Vector{UInt8}, options::Dict{CUDA.CUjit_option_enum, Any})
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/module.jl:41
[7] CuModule
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/module.jl:23 [inlined]
[8] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}})
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/compiler/execution.jl:451
[9] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/cache.jl:95
[10] cufunction(f::CUDA.var"#kernel#361", tt::Type{Tuple{CuDeviceMatrix{Int32, 1}, UInt32, UInt32}}; name::String, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/compiler/execution.jl:297
[11] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/compiler/execution.jl:102 [inlined]
[12] rand!(rng::CUDA.RNG, A::CuArray{Int32, 2, CUDA.Mem.DeviceBuffer})
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/random.jl:60
[13] rand!(A::CuArray{Int32, 2, CUDA.Mem.DeviceBuffer})
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/random.jl:259
[14] rand(T::Type, dim1::Int64, dims::Int64)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/random.jl:273
[15] call_kernel()
@ Main ./REPL[3]:9
[16] top-level scope
@ REPL[4]:1
[17] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/initialization.jl:52
Note that this is not specific to this very kernel. I've tested a similar kernel for Float16, which also works just fine under 1.7.2 but fails with 1.8.0-beta1.
This might also be the reason why we see illegal memory access errors in #1426 and #1425. (It's curious though that #1419 works...)