Skip to content

Commit 7801351

Browse files
authored
optimize constant length memorynew intrinsic (take 2) (#56847)
replaces #55913 (the rebase was more annoying than starting from scratch) This allows the compiler to better understand what's going on for `memorynew` with compile-time constant length, allowing for LLVM level escape analysis in some cases. There is more room to grow this (currently this only optimizes for fairly small Memory since bigger ones would require writing some more LLVM code, and we probably want a size limit on putting Memory on the stack to avoid stackoverflow. For larger ones, we could potentially inline the free so the Memory doesn't have to be swept by the GC, etc. ``` julia> function g() m = Memory{Int}(undef, 2) for i in 1:2 m[i] = i end m[1]+m[2] end julia> @Btime g() 9.735 ns (1 allocation: 48 bytes) #before 1.719 ns (0 allocations: 0 bytes) #after ```
1 parent 10e20f1 commit 7801351

File tree

6 files changed

+182
-47
lines changed

6 files changed

+182
-47
lines changed

src/cgutils.cpp

Lines changed: 105 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3321,7 +3321,7 @@ static Value *emit_genericmemoryptr(jl_codectx_t &ctx, Value *mem, const jl_data
33213321
LoadInst *LI = ctx.builder.CreateAlignedLoad(PPT, addr, Align(sizeof(char*)));
33223322
LI->setOrdering(AtomicOrdering::NotAtomic);
33233323
LI->setMetadata(LLVMContext::MD_nonnull, MDNode::get(ctx.builder.getContext(), None));
3324-
jl_aliasinfo_t aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const);
3324+
jl_aliasinfo_t aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
33253325
aliasinfo.decorateInst(LI);
33263326
Value *ptr = LI;
33273327
if (AS) {
@@ -3347,7 +3347,7 @@ static Value *emit_genericmemoryowner(jl_codectx_t &ctx, Value *t)
33473347
return emit_guarded_test(ctx, foreign, t, [&] {
33483348
addr = ctx.builder.CreateConstInBoundsGEP1_32(ctx.types().T_jlgenericmemory, m, 1);
33493349
LoadInst *owner = ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, addr, Align(sizeof(void*)));
3350-
jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const);
3350+
jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
33513351
ai.decorateInst(owner);
33523352
return ctx.builder.CreateSelect(ctx.builder.CreateIsNull(owner), t, owner);
33533353
});
@@ -4432,6 +4432,105 @@ static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b)
44324432
}
44334433
#endif
44344434

4435+
static auto *emit_genericmemory_unchecked(jl_codectx_t &ctx, Value *cg_nbytes, Value *cg_typ)
4436+
{
4437+
auto ptls = get_current_ptls(ctx);
4438+
auto call = prepare_call(jl_alloc_genericmemory_unchecked_func);
4439+
auto *alloc = ctx.builder.CreateCall(call, { ptls, cg_nbytes, cg_typ});
4440+
alloc->setAttributes(call->getAttributes());
4441+
alloc->addRetAttr(Attribute::getWithAlignment(alloc->getContext(), Align(JL_HEAP_ALIGNMENT)));
4442+
call->addRetAttr(Attribute::getWithDereferenceableBytes(call->getContext(), sizeof(jl_genericmemory_t)));
4443+
return alloc;
4444+
}
4445+
4446+
static void emit_memory_zeroinit_and_stores(jl_codectx_t &ctx, jl_datatype_t *typ, Value* alloc, Value* nbytes, Value* nel, int zi)
4447+
{
4448+
auto arg_typename = [&] JL_NOTSAFEPOINT {
4449+
std::string type_str;
4450+
auto eltype = jl_tparam1(typ);
4451+
if (jl_is_datatype(eltype))
4452+
type_str = jl_symbol_name(((jl_datatype_t*)eltype)->name->name);
4453+
else if (jl_is_uniontype(eltype))
4454+
type_str = "Union";
4455+
else
4456+
type_str = "<unknown type>";
4457+
return "Memory{" + type_str + "}[]";
4458+
};
4459+
setName(ctx.emission_context, alloc, arg_typename);
4460+
// set length (jl_alloc_genericmemory_unchecked_func doesn't have it)
4461+
Value *decay_alloc = decay_derived(ctx, alloc);
4462+
Value *len_field = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 0);
4463+
auto len_store = ctx.builder.CreateAlignedStore(nel, len_field, Align(sizeof(void*)));
4464+
auto aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memorylen);
4465+
aliasinfo.decorateInst(len_store);
4466+
//This avoids the length store from being deleted which is illegal
4467+
ctx.builder.CreateFence(AtomicOrdering::Release, SyncScope::SingleThread);
4468+
// zeroinit pointers and unions
4469+
if (zi) {
4470+
Value *memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
4471+
auto *load = ctx.builder.CreateAlignedLoad(ctx.types().T_ptr, memory_ptr, Align(sizeof(void*)));
4472+
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
4473+
aliasinfo.decorateInst(load);
4474+
auto int8t = getInt8Ty(ctx.builder.getContext());
4475+
ctx.builder.CreateMemSet(load, ConstantInt::get(int8t, 0), nbytes, Align(sizeof(void*)));
4476+
}
4477+
return;
4478+
}
4479+
4480+
4481+
static jl_cgval_t emit_const_len_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, size_t nel, jl_genericmemory_t *inst)
4482+
{
4483+
if (nel == 0) {
4484+
Value *empty_alloc = track_pjlvalue(ctx, literal_pointer_val(ctx, (jl_value_t*)inst));
4485+
return mark_julia_type(ctx, empty_alloc, true, typ);
4486+
}
4487+
const jl_datatype_layout_t *layout = ((jl_datatype_t*)typ)->layout;
4488+
assert(((jl_datatype_t*)typ)->has_concrete_subtype && layout != NULL);
4489+
size_t elsz = layout->size;
4490+
int isboxed = layout->flags.arrayelem_isboxed;
4491+
int isunion = layout->flags.arrayelem_isunion;
4492+
int zi = ((jl_datatype_t*)typ)->zeroinit;
4493+
if (isboxed)
4494+
elsz = sizeof(void*);
4495+
size_t nbytes;
4496+
bool overflow = __builtin_mul_overflow(nel, elsz, &nbytes);
4497+
if (isunion) {
4498+
// an extra byte for each isbits union memory element, stored at m->ptr + m->length
4499+
overflow |= __builtin_add_overflow(nbytes, nel, &nbytes);
4500+
}
4501+
// overflow if signed size is too big or nel is too big (the latter matters iff elsz==0)
4502+
ssize_t tmp=1;
4503+
overflow |= __builtin_add_overflow(nel, 1, &tmp) || __builtin_add_overflow(nbytes, 1, &tmp);
4504+
if (overflow)
4505+
emit_error(ctx, prepare_call(jlargumenterror_func), "invalid GenericMemory size: the number of elements is either negative or too large for system address width");
4506+
4507+
auto T_size = ctx.types().T_size;
4508+
auto cg_typ = literal_pointer_val(ctx, (jl_value_t*) typ);
4509+
auto cg_nbytes = ConstantInt::get(T_size, nbytes);
4510+
auto cg_nel = ConstantInt::get(T_size, nel);
4511+
size_t tot = nbytes + LLT_ALIGN(sizeof(jl_genericmemory_t),JL_SMALL_BYTE_ALIGNMENT);
4512+
// if allocation fits within GC pools
4513+
int pooled = tot <= GC_MAX_SZCLASS;
4514+
Value *alloc, *decay_alloc, *memory_ptr;
4515+
jl_aliasinfo_t aliasinfo;
4516+
if (pooled) {
4517+
alloc = emit_allocobj(ctx, tot, cg_typ, false, JL_SMALL_BYTE_ALIGNMENT);
4518+
decay_alloc = decay_derived(ctx, alloc);
4519+
memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
4520+
setName(ctx.emission_context, memory_ptr, "memory_ptr");
4521+
auto objref = emit_pointer_from_objref(ctx, alloc);
4522+
Value *memory_data = emit_ptrgep(ctx, objref, JL_SMALL_BYTE_ALIGNMENT);
4523+
auto *store = ctx.builder.CreateAlignedStore(memory_data, memory_ptr, Align(sizeof(void*)));
4524+
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
4525+
aliasinfo.decorateInst(store);
4526+
setName(ctx.emission_context, memory_data, "memory_data");
4527+
} else { // just use the dynamic length version since the malloc will be slow anyway
4528+
alloc = emit_genericmemory_unchecked(ctx, cg_nbytes, cg_typ);
4529+
}
4530+
emit_memory_zeroinit_and_stores(ctx, typ, alloc, cg_nbytes, cg_nel, zi);
4531+
return mark_julia_type(ctx, alloc, true, typ);
4532+
}
4533+
44354534
static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval_t nel, jl_genericmemory_t *inst)
44364535
{
44374536
emit_typecheck(ctx, nel, (jl_value_t*)jl_long_type, "memorynew");
@@ -4448,9 +4547,7 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
44484547
if (isboxed)
44494548
elsz = sizeof(void*);
44504549

4451-
auto ptls = get_current_ptls(ctx);
44524550
auto T_size = ctx.types().T_size;
4453-
auto int8t = getInt8Ty(ctx.builder.getContext());
44544551
BasicBlock *emptymemBB, *nonemptymemBB, *retvalBB;
44554552
emptymemBB = BasicBlock::Create(ctx.builder.getContext(), "emptymem");
44564553
nonemptymemBB = BasicBlock::Create(ctx.builder.getContext(), "nonemptymem");
@@ -4466,18 +4563,7 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
44664563
ctx.builder.CreateBr(retvalBB);
44674564
nonemptymemBB->insertInto(ctx.f);
44684565
ctx.builder.SetInsertPoint(nonemptymemBB);
4469-
// else actually allocate mem
4470-
auto arg_typename = [&] JL_NOTSAFEPOINT {
4471-
std::string type_str;
4472-
auto eltype = jl_tparam1(typ);
4473-
if (jl_is_datatype(eltype))
4474-
type_str = jl_symbol_name(((jl_datatype_t*)eltype)->name->name);
4475-
else if (jl_is_uniontype(eltype))
4476-
type_str = "Union";
4477-
else
4478-
type_str = "<unknown type>";
4479-
return "Memory{" + type_str + "}[]";
4480-
};
4566+
44814567
auto cg_typ = literal_pointer_val(ctx, (jl_value_t*) typ);
44824568
auto cg_elsz = ConstantInt::get(T_size, elsz);
44834569

@@ -4501,27 +4587,10 @@ static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval
45014587
overflow = ctx.builder.CreateOr(overflow, tobignel);
45024588
Value *notoverflow = ctx.builder.CreateNot(overflow);
45034589
error_unless(ctx, prepare_call(jlargumenterror_func), notoverflow, "invalid GenericMemory size: the number of elements is either negative or too large for system address width");
4504-
// actually allocate
4505-
auto call = prepare_call(jl_alloc_genericmemory_unchecked_func);
4506-
Value *alloc = ctx.builder.CreateCall(call, { ptls, nbytes, cg_typ});
4507-
// set length (jl_alloc_genericmemory_unchecked_func doesn't have it)
4508-
Value *decay_alloc = decay_derived(ctx, alloc);
4509-
Value *len_field = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 0);
4510-
auto len_store = ctx.builder.CreateAlignedStore(nel_unboxed, len_field, Align(sizeof(void*)));
4511-
auto aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memorylen);
4512-
aliasinfo.decorateInst(len_store);
4513-
//This avoids the length store from being deleted which is illegal
4514-
ctx.builder.CreateFence(AtomicOrdering::Release, SyncScope::SingleThread);
4515-
// zeroinit pointers and unions
4516-
if (zi) {
4517-
Value *memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
4518-
auto *load = ctx.builder.CreateAlignedLoad(ctx.types().T_ptr, memory_ptr, Align(sizeof(void*)));
4519-
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
4520-
aliasinfo.decorateInst(load);
4521-
ctx.builder.CreateMemSet(load, ConstantInt::get(int8t, 0), nbytes, Align(sizeof(void*)));
4522-
}
4590+
// actually allocate the memory
45234591

4524-
setName(ctx.emission_context, alloc, arg_typename);
4592+
Value *alloc = emit_genericmemory_unchecked(ctx, nbytes, cg_typ);
4593+
emit_memory_zeroinit_and_stores(ctx, typ, alloc, nbytes, nel_unboxed, zi);
45254594
ctx.builder.CreateBr(retvalBB);
45264595
nonemptymemBB = ctx.builder.GetInsertBlock();
45274596
// phi node to choose which side of branch

src/codegen.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -408,9 +408,9 @@ struct jl_tbaacache_t {
408408
tbaa_arrayptr = tbaa_make_child(mbuilder, "jtbaa_arrayptr", tbaa_array_scalar).first;
409409
tbaa_arraysize = tbaa_make_child(mbuilder, "jtbaa_arraysize", tbaa_array_scalar).first;
410410
tbaa_arrayselbyte = tbaa_make_child(mbuilder, "jtbaa_arrayselbyte", tbaa_array_scalar).first;
411-
tbaa_memoryptr = tbaa_make_child(mbuilder, "jtbaa_memoryptr", tbaa_array_scalar, true).first;
412-
tbaa_memorylen = tbaa_make_child(mbuilder, "jtbaa_memorylen", tbaa_array_scalar, true).first;
413-
tbaa_memoryown = tbaa_make_child(mbuilder, "jtbaa_memoryown", tbaa_array_scalar, true).first;
411+
tbaa_memoryptr = tbaa_make_child(mbuilder, "jtbaa_memoryptr", tbaa_array_scalar).first;
412+
tbaa_memorylen = tbaa_make_child(mbuilder, "jtbaa_memorylen", tbaa_array_scalar).first;
413+
tbaa_memoryown = tbaa_make_child(mbuilder, "jtbaa_memoryown", tbaa_array_scalar).first;
414414
tbaa_const = tbaa_make_child(mbuilder, "jtbaa_const", nullptr, true).first;
415415
}
416416
};
@@ -1179,6 +1179,7 @@ static const auto jl_alloc_genericmemory_unchecked_func = new JuliaFunction<Type
11791179
},
11801180
[](LLVMContext &C) {
11811181
auto FnAttrs = AttrBuilder(C);
1182+
FnAttrs.addAllocKindAttr(AllocFnKind::Alloc);
11821183
FnAttrs.addMemoryAttr(MemoryEffects::argMemOnly(ModRefInfo::Ref) | MemoryEffects::inaccessibleMemOnly(ModRefInfo::ModRef));
11831184
FnAttrs.addAttribute(Attribute::WillReturn);
11841185
FnAttrs.addAttribute(Attribute::NoUnwind);
@@ -1499,6 +1500,10 @@ static const auto pointer_from_objref_func = new JuliaFunction<>{
14991500
AttrBuilder FnAttrs(C);
15001501
FnAttrs.addMemoryAttr(MemoryEffects::none());
15011502
FnAttrs.addAttribute(Attribute::NoUnwind);
1503+
FnAttrs.addAttribute(Attribute::Speculatable);
1504+
FnAttrs.addAttribute(Attribute::WillReturn);
1505+
FnAttrs.addAttribute(Attribute::NoRecurse);
1506+
FnAttrs.addAttribute(Attribute::NoSync);
15021507
return AttributeList::get(C,
15031508
AttributeSet::get(C, FnAttrs),
15041509
Attributes(C, {Attribute::NonNull}),
@@ -4470,7 +4475,17 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
44704475
jl_genericmemory_t *inst = (jl_genericmemory_t*)((jl_datatype_t*)typ)->instance;
44714476
if (inst == NULL)
44724477
return false;
4473-
*ret = emit_memorynew(ctx, typ, argv[2], inst);
4478+
if (argv[2].constant) {
4479+
if (!jl_is_long(argv[2].constant))
4480+
return false;
4481+
size_t nel = jl_unbox_long(argv[2].constant);
4482+
if (nel < 0)
4483+
return false;
4484+
*ret = emit_const_len_memorynew(ctx, typ, nel, inst);
4485+
}
4486+
else {
4487+
*ret = emit_memorynew(ctx, typ, argv[2], inst);
4488+
}
44744489
return true;
44754490
}
44764491

src/llvm-alloc-helpers.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,8 @@ void jl_alloc::runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs r
250250
return true;
251251
}
252252
if (required.pass.gc_loaded_func == callee) {
253-
required.use_info.haspreserve = true;
254-
required.use_info.hasload = true;
253+
// TODO add manual load->store forwarding
254+
push_inst(inst);
255255
return true;
256256
}
257257
if (required.pass.typeof_func == callee) {

src/llvm-alloc-opt.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -752,11 +752,11 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
752752
call->eraseFromParent();
753753
return;
754754
}
755-
//if (pass.gc_loaded_func == callee) {
756-
// call->replaceAllUsesWith(new_i);
757-
// call->eraseFromParent();
758-
// return;
759-
//}
755+
if (pass.gc_loaded_func == callee) {
756+
// TODO: handle data pointer forwarding, length forwarding, and fence removal
757+
user->replaceUsesOfWith(orig_i, Constant::getNullValue(orig_i->getType()));
758+
return;
759+
}
760760
if (pass.typeof_func == callee) {
761761
++RemovedTypeofs;
762762
call->replaceAllUsesWith(tag);

src/pipeline.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,7 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder *
527527
JULIA_PASS(FPM.addPass(LateLowerGCPass()));
528528
JULIA_PASS(FPM.addPass(FinalLowerGCPass()));
529529
if (O.getSpeedupLevel() >= 2) {
530+
FPM.addPass(DSEPass());
530531
FPM.addPass(GVNPass());
531532
FPM.addPass(SCCPPass());
532533
FPM.addPass(DCEPass());

test/boundscheck_exec.jl

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,4 +297,54 @@ end
297297
typeintersect(Int, Integer)
298298
end |> only === Type{Int}
299299

300+
if bc_opt == bc_default
301+
@testset "Array/Memory escape analysis" begin
302+
function no_allocate(T::Type{<:Union{Memory, Vector}})
303+
v = T(undef, 2)
304+
v[1] = 2
305+
v[2] = 3
306+
return v[1] + v[2]
307+
end
308+
function test_alloc(::Type{T}; broken=false) where T
309+
@test (@allocated no_allocate(T)) == 0 broken=broken
310+
end
311+
@testset "$T" for T in [Memory, Vector]
312+
@testset "$ET" for ET in [Int, Float32, Union{Int, Float64}]
313+
no_allocate(T{ET}) #compile
314+
# allocations aren't removed for Union eltypes which they theoretically could be eventually
315+
test_alloc(T{ET}, broken=(ET==Union{Int, Float64}))
316+
end
317+
end
318+
function f() # this was causing a bug on an in progress version of #55913.
319+
m = Memory{Float64}(undef, 4)
320+
m .= 1.0
321+
s = 0.0
322+
for x m
323+
s += x
324+
end
325+
s
326+
end
327+
@test f() === 4.0
328+
function confuse_alias_analysis()
329+
mem0 = Memory{Int}(undef, 1)
330+
mem1 = Memory{Int}(undef, 1)
331+
@inbounds mem0[1] = 3
332+
for width in 1:2
333+
@inbounds mem1[1] = mem0[1]
334+
mem0 = mem1
335+
end
336+
mem0[1]
337+
end
338+
@test confuse_alias_analysis() == 3
339+
@test (@allocated confuse_alias_analysis()) == 0
340+
function no_alias_prove(n)
341+
m1 = Memory{Int}(undef,n)
342+
m2 = Memory{Int}(undef,n)
343+
m1 === m2
344+
end
345+
no_alias_prove(1)
346+
@test_broken (@allocated no_alias_prove(5)) == 0
347+
end
348+
end
349+
300350
end

0 commit comments

Comments
 (0)