-
-
Notifications
You must be signed in to change notification settings - Fork 5.6k
Open
Labels
compiler:codegen (Generation of LLVM IR and native code), performance (Must go faster)
Description
I don't think this is a big problem,
but it was surprising to me:
julia> int32_or_float32() = rand((rand() > 0.5 ? Int32 : Float32));
julia> function bar()
a = @noinline int32_or_float32()
return a + a
end;
julia> Base.code_ircode(bar) |> only
2 1 ─ %1 = invoke Main.int32_or_float32()::Union{Float32, Int32}
3 │ %2 = (isa)(%1, Float32)::Bool │
│ %3 = (isa)(%1, Float32)::Bool │
│ %4 = (Core.Intrinsics.and_int)(%2, %3)::Bool │
└── goto #3 if not %4 │
2 ─ %6 = π (%1, Float32) │
│ %7 = π (%1, Float32) │
│ %8 = Base.add_float(%6, %7)::Float32 │╻ +
└── goto #9 │
3 ─ %10 = (isa)(%1, Int32)::Bool │
│ %11 = (isa)(%1, Float32)::Bool │
│ %12 = (Core.Intrinsics.and_int)(%10, %11)::Bool │
└── goto #5 if not %12 │
4 ─ %14 = π (%1, Int32) │
│ %15 = π (%1, Float32) │
│ %16 = Base.sitofp(Float32, %14)::Float32 ││╻╷╷╷ promote
│ %17 = Base.add_float(%16, %15)::Float32 ││╻ +
└── goto #9 │
5 ─ %19 = (isa)(%1, Float32)::Bool │
│ %20 = (isa)(%1, Int32)::Bool │
│ %21 = (Core.Intrinsics.and_int)(%19, %20)::Bool │
└── goto #7 if not %21 │
6 ─ %23 = π (%1, Float32) │
│ %24 = π (%1, Int32) │
│ %25 = Base.sitofp(Float32, %24)::Float32 ││╻╷╷╷ promote
│ %26 = Base.add_float(%23, %25)::Float32 ││╻ +
└── goto #9 │
7 ─ nothing::Nothing │
8 ─ %29 = π (%1, Int32) │
│ %30 = π (%1, Int32) │
│ %31 = Base.add_int(%29, %30)::Int32 │╻ +
└── goto #9 │
9 ┄ %33 = φ (#2 => %8, #4 => %17, #6 => %26, #8 => %31)::Union{Float32, Int32}
└── return %33 │
=> Union{Float32, Int32}
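Blocks 3, 4, 5, 6 and 7 are all dead weight: blocks 4 and 6 cannot be reached, because their guards (%12 and %21) require %1 to be both an Int32 and a Float32 at the same time, which in turn makes the checks in blocks 3, 5 and 7 redundant.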
After LLVM has run its optimizations, though, everything looks fine:
julia> @code_llvm bar()
; Function Signature: bar()
; @ REPL[3]:1 within `bar`
define { ptr, i8 } @julia_bar_3404(ptr noalias nocapture noundef nonnull align 4 dereferenceable(4) %union_bytes_return) #0 {
top:
%sret_box = alloca [4 x i8], align 4
; @ REPL[3]:2 within `bar`
%0 = call { ptr, i8 } @j_int32_or_float32_3407(ptr noalias nocapture noundef nonnull %sret_box)
%1 = extractvalue { ptr, i8 } %0, 0
%2 = extractvalue { ptr, i8 } %0, 1
%3 = icmp slt i8 %2, 0
%4 = select i1 %3, ptr %1, ptr %sret_box
; @ REPL[3]:3 within `bar`
%5 = and i8 %2, 127
%.not = icmp eq i8 %5, 1
br i1 %.not, label %union_move, label %union_move3
post_union_move: ; preds = %union_move3, %union_move
%storemerge = phi i32 [ %9, %union_move3 ], [ %8, %union_move ]
%tindex_phi20 = phi i8 [ 2, %union_move3 ], [ 1, %union_move ]
store i32 %storemerge, ptr %union_bytes_return, align 4
%6 = insertvalue { ptr, i8 } { ptr null, i8 undef }, i8 %tindex_phi20, 1
ret { ptr, i8 } %6
union_move: ; preds = %top
; ┌ @ float.jl:478 within `+`
%.unbox = load float, ptr %4, align 4
%7 = fadd float %.unbox, %.unbox
%8 = bitcast float %7 to i32
; └
br label %post_union_move
union_move3: ; preds = %top
; ┌ @ int.jl:87 within `+`
%.unbox15 = load i32, ptr %4, align 4
%9 = shl i32 %.unbox15, 1
; └
br label %post_union_move
}
So I think the redundant branches have zero runtime cost.
julia> versioninfo()
Julia Version 1.12.0-DEV.469
Commit 0d1d4ba068 (2024-05-06 21:04 UTC)
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 20 × 12th Gen Intel(R) Core(TM) i7-12700H
WORD_SIZE: 64
LLVM: libLLVM-17.0.6 (ORCJIT, alderlake)
Threads: 1 default, 0 interactive, 1 GC (on 20 virtual cores)
Environment:
JULIA_PKG_SERVER =
JULIA_PKG_USE_CLI_GIT = true
Metadata
Metadata
Assignees
Labels
compiler:codegen (Generation of LLVM IR and native code), performance (Must go faster)