Skip to content

PTX: LLVM assertion error during function merging #461

Closed
@maleadt

Description

@maleadt

After #444, running the CUDA.jl test suite with LLVM assertions throws:

opt: /workspace/srcdir/llvm-project/llvm/lib/IR/Operator.cpp:91: bool llvm::GEPOperator::accumulateConstantOffset(const llvm::DataLayout&, llvm::APInt&, llvm::function_ref<bool(llvm::Value&, llvm::APInt&)>) const: Assertion `Offset.getBitWidth() == DL.getIndexSizeInBits(getPointerAddressSpace()) && "The offset bit width does not match DL specification."' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /home/tim/Julia/src/julia/build/dev/usr/tools/opt wip.ll --mergefunc -o /dev/null
 #0 0x00007f09df31871f PrintStackTraceSignalHandler(void*) Signals.cpp:0:0
 #1 0x00007f09df31623c SignalHandler(int) Signals.cpp:0:0
 #2 0x00007f09de24fab0 (/usr/lib/libc.so.6+0x39ab0)
 #3 0x00007f09de29f26c (/usr/lib/libc.so.6+0x8926c)
 #4 0x00007f09de24fa08 raise (/usr/lib/libc.so.6+0x39a08)
 #5 0x00007f09de238538 abort (/usr/lib/libc.so.6+0x22538)
 #6 0x00007f09de23845c (/usr/lib/libc.so.6+0x2245c)
 #7 0x00007f09de2483d6 (/usr/lib/libc.so.6+0x323d6)
 #8 0x00007f09df4c1571 llvm::GEPOperator::accumulateConstantOffset(llvm::DataLayout const&, llvm::APInt&, llvm::function_ref<bool (llvm::Value&, llvm::APInt&)>) const (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0xcc1571)
 #9 0x00007f09e0116a8c llvm::FunctionComparator::cmpGEPs(llvm::GEPOperator const*, llvm::GEPOperator const*) const (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x1916a8c)
#10 0x00007f09e011712c llvm::FunctionComparator::cmpOperations(llvm::Instruction const*, llvm::Instruction const*, bool&) const (.part.397) FunctionComparator.cpp:0:0
#11 0x00007f09e0117db0 llvm::FunctionComparator::cmpBasicBlocks(llvm::BasicBlock const*, llvm::BasicBlock const*) const (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x1917db0)
#12 0x00007f09e01193ce llvm::FunctionComparator::compare() (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x19193ce)
#13 0x00007f09e0859e56 (anonymous namespace)::MergeFunctions::runOnModule(llvm::Module&) MergeFunctions.cpp:0:0
#14 0x00007f09e085c12c llvm::MergeFunctionsPass::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x205c12c)
#15 0x00007f09e2084f5d llvm::detail::PassModel<llvm::Module, llvm::MergeFunctionsPass, llvm::PreservedAnalyses, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0x3884f5d)
#16 0x00007f09df4cdeb2 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/tim/Julia/src/julia/build/dev/usr/tools/../lib/libLLVM-15jl.so+0xccdeb2)
#17 0x0000000000426e70 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::StringRef>, llvm::ArrayRef<llvm::PassPlugin>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool) (/home/tim/Julia/src/julia/build/dev/usr/tools/opt+0x426e70)
#18 0x000000000041a703 main (/home/tim/Julia/src/julia/build/dev/usr/tools/opt+0x41a703)
#19 0x00007f09de239850 (/usr/lib/libc.so.6+0x23850)
#20 0x00007f09de23990a __libc_start_main (/usr/lib/libc.so.6+0x2390a)
#21 0x000000000041a992 _start /workspace/srcdir/glibc-2.12.2/csu/../sysdeps/x86_64/elf/start.S:116:0
zsh: IOT instruction  /home/tim/Julia/src/julia/build/dev/usr/tools/opt wip.ll --mergefunc -o

MWE:

using CUDA

# First kernel of the reproducer: prints "a " and then, from inside the kernel,
# launches `kernel_b` via `@cuda dynamic=true`, forwarding `x` unchanged.
function kernel_a(x::Bool)
    @cuprint("a ")
    @cuda dynamic=true kernel_b(x)
    return
end

# Second kernel of the reproducer: prints "b " and then, from inside the kernel,
# launches `kernel_c` via `@cuda dynamic=true`, forwarding `x` unchanged.
function kernel_b(x::Bool)
    @cuprint("b ")
    @cuda dynamic=true kernel_c(x)
    return
end

# Last kernel of the chain: only prints "c "; `x` is accepted but not used.
function kernel_c(x::Bool)
    @cuprint("c ")
    return
end

# Entry point of the MWE: launch the first kernel of the a -> b -> c chain.
@cuda kernel_a(true)

This is the post-opt IR, which fails during function merging:

source_filename = "start"
target datalayout = "e-p:64:64:64:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

%0 = type { i64 }
%1 = type { i64, i64, i64 }

@global = private unnamed_addr constant [3 x i8] c"a \00", align 1
@global1 = private unnamed_addr constant [3 x i8] c"b \00", align 1
@global2 = private unnamed_addr constant [75 x i8] c"ERROR: a CUDA error was thrown during kernel execution: %s (code %ld, %s)\0A\00", align 1
@global3 = private unnamed_addr constant [3 x i8] c"c \00", align 1
@global4 = private unnamed_addr constant [108 x i8] c"ERROR: a %s was thrown during kernel execution.\0A       Run Julia on debug level 2 for device stack traces.\0A\00", align 1
@global5 = private unnamed_addr constant [110 x i8] c"WARNING: could not signal exception status to the host, execution will continue.\0A         Please file a bug.\0A\00", align 1
@global6 = private unnamed_addr constant [10 x i8] c"exception\00", align 1

declare i64 @snork(i32) local_unnamed_addr

declare i64 @wobble(i32) local_unnamed_addr

declare i32 @vprintf(i8*, i8*) local_unnamed_addr

declare i64 @snork7(i64, { i32, i32, i32 }, { i32, i32, i32 }, i32) local_unnamed_addr

declare i32 @widget(i64, i64) local_unnamed_addr

; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #0

; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #0

; Prints the format string @global4 through vprintf with one i64 argument:
; %arg is stored into a single-field stack buffer (%0 = { i64 }) and the
; buffer's address is passed as vprintf's second operand.
define internal void @bar(i64 zeroext %arg) local_unnamed_addr #1 {
bb:
  %tmp = alloca %0, align 8
  %tmp1 = bitcast %0* %tmp to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %tmp1)
  ; GEP with i32 indices selecting the only field of the buffer.
  %tmp2 = getelementptr inbounds %0, %0* %tmp, i32 0, i32 0
  store i64 %arg, i64* %tmp2, align 8
  %tmp3 = call i32 @vprintf(i8* getelementptr inbounds ([108 x i8], [108 x i8]* @global4, i32 0, i32 0), i8* nonnull %tmp1)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %tmp1)
  ret void
}

; Function Attrs: nocallback nounwind
declare void @llvm.nvvm.membar.sys() #2

; Function Attrs: noinline noreturn
; Error-reporting path called from @foo's bb11. Loads the i32 stored at %arg1,
; passes it to @snork and @wobble, packs the two results plus the zero-extended
; code into a three-field stack buffer (%1), prints @global2 via vprintf, then
; calls @bar and @blam and finishes with the inline asm "exit;".
; NOTE(review): the body is instruction-for-instruction identical to @eggs, so
; this is presumably one of the pairs --mergefunc compares when the assertion
; fires - confirm.
define internal fastcc void @wombat([1 x i64] %arg, [1 x i32]* nocapture noundef nonnull readonly align 4 dereferenceable(4) %arg1) unnamed_addr #3 {
bb:
  %tmp = alloca %1, align 8
  %tmp2 = getelementptr inbounds [1 x i32], [1 x i32]* %arg1, i32 0, i32 0
  %tmp3 = load i32, i32* %tmp2, align 4, !tbaa !5, !alias.scope !9, !noalias !12
  %tmp4 = call i64 @snork(i32 %tmp3)
  %tmp5 = zext i32 %tmp3 to i64
  %tmp6 = call i64 @wobble(i32 %tmp3)
  %tmp7 = bitcast %1* %tmp to i8*
  call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %tmp7)
  ; Fill the three i64 fields of the vprintf argument buffer (i32 GEP indices).
  %tmp8 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 0
  store i64 %tmp4, i64* %tmp8, align 8
  %tmp9 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 1
  store i64 %tmp5, i64* %tmp9, align 8
  %tmp10 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 2
  store i64 %tmp6, i64* %tmp10, align 8
  %tmp11 = call i32 @vprintf(i8* getelementptr inbounds ([75 x i8], [75 x i8]* @global2, i32 0, i32 0), i8* nonnull %tmp7)
  call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %tmp7)
  call void @bar(i64 ptrtoint ([10 x i8]* @global6 to i64))
  call void @blam([1 x i64] %arg)
  call void asm sideeffect "exit;", ""() #4
  unreachable
}

; ptx_kernel that prints @global1 ("b ") and then sets up a call involving
; @blam9: @snork7 is called with @blam9's address and two { 1, 1, 1 } triples,
; the returned i64 is reinterpreted as a pointer, element 0 of %arg is stored at
; offset 0 and %arg1 at byte offset 8, and @widget is called on the same value.
; A non-zero @widget result is spilled to the stack and handed to @eggs.
; Same shape as @foo, which differs in the global printed and the callees used.
define ptx_kernel void @wombat8([1 x i64] %arg, i8 zeroext %arg1) local_unnamed_addr #1 {
bb:
  %tmp = alloca [1 x i32], align 4
  %tmp2 = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @global1, i32 0, i32 0), i8* null)
  %tmp3 = extractvalue [1 x i64] %arg, 0
  %tmp4 = call i64 @snork7(i64 ptrtoint (void ([1 x i64], i8)* @blam9 to i64), { i32, i32, i32 } { i32 1, i32 1, i32 1 }, { i32, i32, i32 } { i32 1, i32 1, i32 1 }, i32 0)
  %tmp5 = inttoptr i64 %tmp4 to [1 x i64]*
  %tmp6 = getelementptr inbounds [1 x i64], [1 x i64]* %tmp5, i32 0, i32 0
  store i64 %tmp3, i64* %tmp6, align 8, !tbaa !17, !alias.scope !19, !noalias !20
  %tmp7 = inttoptr i64 %tmp4 to i8*
  ; Byte offset 8 from the same base address, again with an i32 index.
  %tmp8 = getelementptr i8, i8* %tmp7, i32 8
  store i8 %arg1, i8* %tmp8, align 1, !tbaa !17, !alias.scope !19, !noalias !20
  %tmp9 = call i32 @widget(i64 %tmp4, i64 0)
  %tmp10 = icmp eq i32 %tmp9, 0
  br i1 %tmp10, label %bb13, label %bb11

bb11:                                             ; preds = %bb
  ; Non-zero result from @widget: store it and take the noreturn error path.
  %tmp12 = getelementptr inbounds [1 x i32], [1 x i32]* %tmp, i32 0, i32 0
  store i32 %tmp9, i32* %tmp12, align 4, !tbaa !21, !alias.scope !23, !noalias !24
  call fastcc void @eggs([1 x i64] %arg, [1 x i32]* %tmp)
  unreachable

bb13:                                             ; preds = %bb
  ret void
}

; Treats element 0 of %arg as an address. If it is non-zero, stores the i64
; value 1 there and issues llvm.nvvm.membar.sys; if it is zero, prints the
; warning text in @global5 via vprintf instead.
define internal void @blam([1 x i64] %arg) local_unnamed_addr #1 {
bb:
  %tmp = extractvalue [1 x i64] %arg, 0
  %tmp1 = icmp eq i64 %tmp, 0
  br i1 %tmp1, label %bb4, label %bb2

bb2:                                              ; preds = %bb
  %tmp3 = inttoptr i64 %tmp to i64*
  store i64 1, i64* %tmp3, align 1, !tbaa !17, !alias.scope !19, !noalias !20
  call void @llvm.nvvm.membar.sys()
  br label %bb6

bb4:                                              ; preds = %bb
  %tmp5 = call i32 @vprintf(i8* getelementptr inbounds ([110 x i8], [110 x i8]* @global5, i32 0, i32 0), i8* null)
  br label %bb6

bb6:                                              ; preds = %bb4, %bb2
  ret void
}

; ptx_kernel that prints @global ("a ") and then sets up a call involving
; @wombat8: @snork7 is called with @wombat8's address and two { 1, 1, 1 }
; triples, the returned i64 is reinterpreted as a pointer, element 0 of %arg is
; stored at offset 0 and %arg1 at byte offset 8, and @widget is called on the
; same value. A non-zero @widget result is spilled to the stack and handed to
; @wombat.
; Same shape as @wombat8, which differs in the global printed and the callees
; used.
define ptx_kernel void @foo([1 x i64] %arg, i8 zeroext %arg1) local_unnamed_addr #1 {
bb:
  %tmp = alloca [1 x i32], align 4
  %tmp2 = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @global, i32 0, i32 0), i8* null)
  %tmp3 = extractvalue [1 x i64] %arg, 0
  %tmp4 = call i64 @snork7(i64 ptrtoint (void ([1 x i64], i8)* @wombat8 to i64), { i32, i32, i32 } { i32 1, i32 1, i32 1 }, { i32, i32, i32 } { i32 1, i32 1, i32 1 }, i32 0)
  %tmp5 = inttoptr i64 %tmp4 to [1 x i64]*
  %tmp6 = getelementptr inbounds [1 x i64], [1 x i64]* %tmp5, i32 0, i32 0
  store i64 %tmp3, i64* %tmp6, align 8, !tbaa !17, !alias.scope !19, !noalias !20
  %tmp7 = inttoptr i64 %tmp4 to i8*
  ; Byte offset 8 from the same base address, again with an i32 index.
  %tmp8 = getelementptr i8, i8* %tmp7, i32 8
  store i8 %arg1, i8* %tmp8, align 1, !tbaa !17, !alias.scope !19, !noalias !20
  %tmp9 = call i32 @widget(i64 %tmp4, i64 0)
  %tmp10 = icmp eq i32 %tmp9, 0
  br i1 %tmp10, label %bb13, label %bb11

bb11:                                             ; preds = %bb
  ; Non-zero result from @widget: store it and take the noreturn error path.
  %tmp12 = getelementptr inbounds [1 x i32], [1 x i32]* %tmp, i32 0, i32 0
  store i32 %tmp9, i32* %tmp12, align 4, !tbaa !21, !alias.scope !23, !noalias !24
  call fastcc void @wombat([1 x i64] %arg, [1 x i32]* %tmp)
  unreachable

bb13:                                             ; preds = %bb
  ret void
}

; Function Attrs: noinline noreturn
; Error-reporting path called from @wombat8's bb11. Loads the i32 stored at
; %arg1, passes it to @snork and @wobble, packs the two results plus the
; zero-extended code into a three-field stack buffer (%1), prints @global2 via
; vprintf, then calls @bar and @blam and finishes with the inline asm "exit;".
; NOTE(review): the body is instruction-for-instruction identical to @wombat,
; so this is presumably one of the pairs --mergefunc compares when the
; assertion fires - confirm.
define internal fastcc void @eggs([1 x i64] %arg, [1 x i32]* nocapture noundef nonnull readonly align 4 dereferenceable(4) %arg1) unnamed_addr #3 {
bb:
  %tmp = alloca %1, align 8
  %tmp2 = getelementptr inbounds [1 x i32], [1 x i32]* %arg1, i32 0, i32 0
  %tmp3 = load i32, i32* %tmp2, align 4, !tbaa !5, !alias.scope !9, !noalias !12
  %tmp4 = call i64 @snork(i32 %tmp3)
  %tmp5 = zext i32 %tmp3 to i64
  %tmp6 = call i64 @wobble(i32 %tmp3)
  %tmp7 = bitcast %1* %tmp to i8*
  call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %tmp7)
  ; Fill the three i64 fields of the vprintf argument buffer (i32 GEP indices).
  %tmp8 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 0
  store i64 %tmp4, i64* %tmp8, align 8
  %tmp9 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 1
  store i64 %tmp5, i64* %tmp9, align 8
  %tmp10 = getelementptr inbounds %1, %1* %tmp, i32 0, i32 2
  store i64 %tmp6, i64* %tmp10, align 8
  %tmp11 = call i32 @vprintf(i8* getelementptr inbounds ([75 x i8], [75 x i8]* @global2, i32 0, i32 0), i8* nonnull %tmp7)
  call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %tmp7)
  call void @bar(i64 ptrtoint ([10 x i8]* @global6 to i64))
  call void @blam([1 x i64] %arg)
  call void asm sideeffect "exit;", ""() #4
  unreachable
}

; ptx_kernel that only prints @global3 ("c ") via vprintf; neither %arg nor
; %arg1 is used in the body.
define ptx_kernel void @blam9([1 x i64] %arg, i8 zeroext %arg1) local_unnamed_addr #1 {
bb:
  %tmp = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @global3, i32 0, i32 0), i8* null)
  ret void
}

attributes #0 = { argmemonly nocallback nofree nosync nounwind willreturn }
attributes #1 = { "probe-stack"="inline-asm" }
attributes #2 = { nocallback nounwind }
attributes #3 = { noinline noreturn "probe-stack"="inline-asm" }
attributes #4 = { nounwind }

!llvm.module.flags = !{!0, !1}
!julia.kernel = !{!2, !3, !4}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{void ([1 x i64], i8)* @wombat8}
!3 = !{void ([1 x i64], i8)* @blam9}
!4 = !{void ([1 x i64], i8)* @foo}
!5 = !{!6, !6, i64 0}
!6 = !{!"jtbaa_const", !7, i64 0}
!7 = !{!"jtbaa", !8, i64 0}
!8 = !{!"jtbaa"}
!9 = !{!10}
!10 = !{!"jnoalias_const", !11}
!11 = !{!"jnoalias"}
!12 = !{!13, !14, !15, !16}
!13 = !{!"jnoalias_gcframe", !11}
!14 = !{!"jnoalias_stack", !11}
!15 = !{!"jnoalias_data", !11}
!16 = !{!"jnoalias_typemd", !11}
!17 = !{!18, !18, i64 0}
!18 = !{!"jtbaa_data", !7, i64 0}
!19 = !{!15}
!20 = !{!13, !14, !16, !10}
!21 = !{!22, !22, i64 0}
!22 = !{!"jtbaa_stack", !7, i64 0}
!23 = !{!14}
!24 = !{!13, !15, !16, !10}

Or, reduced:

target datalayout = "e-p:64:64:64:32"
target triple = "nvptx64-nvidia-cuda"

%0 = type { i64 }

; Reduced reproducer, function 1 of 2: a single struct GEP with i32 indices
; followed by `unreachable`. The body is identical to @bar in this module.
; NOTE(review): the module's datalayout `p:64:64:64:32` gives 64-bit pointers
; with (per the LLVM LangRef pointer spec) a 32-bit index size, and the failing
; assertion checks the offset's bit width against that index size - presumably
; this GEP is where the mismatch shows up; confirm.
define internal fastcc void @foo(%0* %tmp) {
bb:
  %tmp1 = getelementptr inbounds %0, %0* %tmp, i32 0, i32 0
  unreachable
}

; Reduced reproducer, function 2 of 2: identical body to @foo in this module
; (one struct GEP with i32 indices, then `unreachable`), which gives
; --mergefunc a pair of functions containing GEPs to compare.
define internal fastcc void @bar(%0* %tmp) {
bb:
  %tmp1 = getelementptr inbounds %0, %0* %tmp, i32 0, i32 0
  unreachable
}

I guess this data layout (DL) modification was illegal? I still need to take a closer look.

Metadata

Metadata

Assignees

No one assigned

    Labels

    ptx: Stuff about the NVIDIA PTX back-end.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions