Skip to content

Commit 58ce713

Browse files
authored
Experiment with a slightly adjusted pipeline (#52850)
Needs #57380 to merge first
1 parent fb0a283 commit 58ce713

File tree

6 files changed

+118
-73
lines changed

6 files changed

+118
-73
lines changed

src/pipeline.cpp

Lines changed: 96 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,22 @@
2424
#include <llvm/Passes/PassPlugin.h>
2525

2626
// NewPM needs to manually include all the pass headers
27+
#include <llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h>
2728
#include <llvm/Transforms/IPO/AlwaysInliner.h>
2829
#include <llvm/Transforms/IPO/Annotation2Metadata.h>
2930
#include <llvm/Transforms/IPO/ConstantMerge.h>
3031
#include <llvm/Transforms/IPO/ForceFunctionAttrs.h>
3132
#include <llvm/Transforms/IPO/GlobalDCE.h>
33+
#include <llvm/Transforms/IPO/GlobalOpt.h>
3234
#include <llvm/Transforms/IPO/StripDeadPrototypes.h>
3335
#include <llvm/Transforms/InstCombine/InstCombine.h>
3436
#include <llvm/Transforms/Instrumentation/AddressSanitizer.h>
3537
#include <llvm/Transforms/Instrumentation/MemorySanitizer.h>
3638
#include <llvm/Transforms/Instrumentation/ThreadSanitizer.h>
3739
#include <llvm/Transforms/Scalar/ADCE.h>
3840
#include <llvm/Transforms/Scalar/AnnotationRemarks.h>
41+
#include <llvm/Transforms/Scalar/BDCE.h>
42+
#include "llvm/Transforms/Scalar/ConstraintElimination.h"
3943
#include <llvm/Transforms/Scalar/CorrelatedValuePropagation.h>
4044
#include <llvm/Transforms/Scalar/DCE.h>
4145
#include <llvm/Transforms/Scalar/DeadStoreElimination.h>
@@ -59,13 +63,17 @@
5963
#include <llvm/Transforms/Scalar/LowerConstantIntrinsics.h>
6064
#include <llvm/Transforms/Scalar/LowerExpectIntrinsic.h>
6165
#include <llvm/Transforms/Scalar/MemCpyOptimizer.h>
66+
#include <llvm/Transforms/Scalar/MergedLoadStoreMotion.h>
6267
#include <llvm/Transforms/Scalar/Reassociate.h>
6368
#include <llvm/Transforms/Scalar/SCCP.h>
6469
#include <llvm/Transforms/Scalar/SROA.h>
6570
#include <llvm/Transforms/Scalar/SimpleLoopUnswitch.h>
6671
#include <llvm/Transforms/Scalar/SimplifyCFG.h>
6772
#include <llvm/Transforms/Scalar/WarnMissedTransforms.h>
73+
#include <llvm/Transforms/Utils/LibCallsShrinkWrap.h>
6874
#include <llvm/Transforms/Utils/InjectTLIMappings.h>
75+
#include <llvm/Transforms/Utils/Mem2Reg.h>
76+
#include <llvm/Transforms/Utils/RelLookupTableConverter.h>
6977
#include <llvm/Transforms/Utils/ModuleUtils.h>
7078
#include <llvm/Transforms/Utils/SimplifyCFGOptions.h>
7179
#include <llvm/Transforms/Vectorize/LoopVectorize.h>
@@ -196,10 +204,9 @@ namespace {
196204
.convertSwitchRangeToICmp(true)
197205
.convertSwitchToLookupTable(true)
198206
.forwardSwitchCondToPhi(true)
199-
//These mess with loop rotation, so only do them after that
207+
.needCanonicalLoops(false)
200208
.hoistCommonInsts(true)
201-
// Causes an SRET assertion error in late-gc-lowering
202-
// .sinkCommonInsts(true)
209+
.sinkCommonInsts(true)
203210
;
204211
}
205212

@@ -341,10 +348,16 @@ static void buildEarlySimplificationPipeline(ModulePassManager &MPM, PassBuilder
341348
FPM.addPass(DCEPass());
342349
FPM.addPass(SimplifyCFGPass(basicSimplifyCFGOptions()));
343350
if (O.getSpeedupLevel() >= 1) {
344-
// TODO check the LLVM 15 default.
345-
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
351+
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
352+
FPM.addPass(EarlyCSEPass());
346353
}
347354
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
355+
if (O.getSpeedupLevel() >= 1) {
356+
FunctionPassManager GlobalFPM;
357+
MPM.addPass(GlobalOptPass());
358+
GlobalFPM.addPass(PromotePass());
359+
GlobalFPM.addPass(InstCombinePass());
360+
}
348361
}
349362
invokeEarlySimplificationCallbacks(MPM, PB, O);
350363
}
@@ -379,22 +392,24 @@ static void buildEarlyOptimizerPipeline(ModulePassManager &MPM, PassBuilder *PB,
379392
if (O.getSpeedupLevel() >= 1) {
380393
FunctionPassManager FPM;
381394
if (O.getSpeedupLevel() >= 2) {
382-
// TODO check the LLVM 15 default.
383-
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
384-
// SROA can duplicate PHI nodes which can block LowerSIMD
385-
FPM.addPass(InstCombinePass());
386-
FPM.addPass(JumpThreadingPass());
387-
FPM.addPass(CorrelatedValuePropagationPass());
388-
FPM.addPass(ReassociatePass());
389-
FPM.addPass(EarlyCSEPass());
390-
JULIA_PASS(FPM.addPass(AllocOptPass()));
391-
} else { // if (O.getSpeedupLevel() >= 1) (exactly)
392-
FPM.addPass(InstCombinePass());
393-
FPM.addPass(EarlyCSEPass());
394-
}
395-
invokePeepholeEPCallbacks(FPM, PB, O);
396-
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
395+
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
396+
FPM.addPass(EarlyCSEPass(true));
397+
FPM.addPass(InstCombinePass());
398+
FPM.addPass(AggressiveInstCombinePass());
399+
FPM.addPass(JumpThreadingPass());
400+
FPM.addPass(CorrelatedValuePropagationPass());
401+
FPM.addPass(LibCallsShrinkWrapPass());
402+
FPM.addPass(ReassociatePass());
403+
FPM.addPass(ConstraintEliminationPass());
404+
JULIA_PASS(FPM.addPass(AllocOptPass()));
405+
} else { // if (O.getSpeedupLevel() >= 1) (exactly)
406+
FPM.addPass(EarlyCSEPass());
407+
FPM.addPass(InstCombinePass());
408+
}
409+
invokePeepholeEPCallbacks(FPM, PB, O);
410+
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), /*UseMemorySSA = */true));
397411
}
412+
MPM.addPass(GlobalOptPass());
398413
MPM.addPass(GlobalDCEPass());
399414
}
400415
MPM.addPass(AfterEarlyOptimizationMarkerPass());
@@ -407,41 +422,41 @@ static void buildLoopOptimizerPipeline(FunctionPassManager &FPM, PassBuilder *PB
407422
LoopPassManager LPM;
408423
LPM.addPass(LowerSIMDLoopPass());
409424
if (O.getSpeedupLevel() >= 2) {
410-
LPM.addPass(LoopRotatePass());
425+
LPM.addPass(LoopInstSimplifyPass());
426+
LPM.addPass(LoopSimplifyCFGPass());
427+
LPM.addPass(BeforeLICMMarkerPass());
428+
auto opts = LICMOptions();
429+
opts.AllowSpeculation = false;
430+
LPM.addPass(LICMPass(opts));
431+
LPM.addPass(JuliaLICMPass());
432+
LPM.addPass(LoopRotatePass(true, false));
433+
LPM.addPass(LICMPass(LICMOptions()));
434+
LPM.addPass(JuliaLICMPass());
435+
LPM.addPass(AfterLICMMarkerPass());
436+
LPM.addPass(SimpleLoopUnswitchPass(/*NonTrivial*/true, true));
411437
}
412438
invokeLateLoopOptimizationCallbacks(LPM, PB, O);
413439
//We don't know if the loop callbacks support MSSA
414-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */false));
415-
}
416-
if (O.getSpeedupLevel() >= 2) {
417-
LoopPassManager LPM;
418-
LPM.addPass(BeforeLICMMarkerPass());
419-
LPM.addPass(LICMPass(LICMOptions()));
420-
LPM.addPass(JuliaLICMPass());
421-
LPM.addPass(SimpleLoopUnswitchPass(/*NonTrivial*/true, true));
422-
LPM.addPass(LICMPass(LICMOptions()));
423-
LPM.addPass(JuliaLICMPass());
424-
LPM.addPass(AfterLICMMarkerPass());
425-
//LICM needs MemorySSA now, so we must use it
426440
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */true));
427441
}
428-
if (O.getSpeedupLevel() >= 2) {
442+
if (O.getSpeedupLevel() >= 2)
429443
FPM.addPass(IRCEPass());
430-
}
431444
{
432445
LoopPassManager LPM;
433446
LPM.addPass(BeforeLoopSimplificationMarkerPass());
434447
if (O.getSpeedupLevel() >= 2) {
435-
LPM.addPass(LoopInstSimplifyPass());
436448
LPM.addPass(LoopIdiomRecognizePass());
437449
LPM.addPass(IndVarSimplifyPass());
450+
LPM.addPass(SimpleLoopUnswitchPass(/*NonTrivial*/true, true));
438451
LPM.addPass(LoopDeletionPass());
439452
// This unroll will only unroll loops when the trip count is known and small,
440453
// so that no loop remains
441454
LPM.addPass(LoopFullUnrollPass());
442455
}
443456
invokeLoopOptimizerEndCallbacks(LPM, PB, O);
444457
LPM.addPass(AfterLoopSimplificationMarkerPass());
458+
FPM.addPass(SimplifyCFGPass(basicSimplifyCFGOptions()));
459+
FPM.addPass(InstCombinePass());
445460
//We don't know if the loop end callbacks support MSSA
446461
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */false));
447462
}
@@ -454,17 +469,28 @@ static void buildScalarOptimizerPipeline(FunctionPassManager &FPM, PassBuilder *
454469
if (options.enable_scalar_optimizations) {
455470
if (O.getSpeedupLevel() >= 2) {
456471
JULIA_PASS(FPM.addPass(AllocOptPass()));
457-
// TODO check the LLVM 15 default.
458-
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
459-
FPM.addPass(InstSimplifyPass());
472+
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
473+
FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
474+
FPM.addPass(MergedLoadStoreMotionPass());
460475
FPM.addPass(GVNPass());
461-
FPM.addPass(MemCpyOptPass());
462476
FPM.addPass(SCCPPass());
477+
FPM.addPass(BDCEPass());
478+
FPM.addPass(InstCombinePass());
463479
FPM.addPass(CorrelatedValuePropagationPass());
464-
FPM.addPass(DCEPass());
480+
FPM.addPass(ADCEPass());
481+
FPM.addPass(MemCpyOptPass());
482+
FPM.addPass(DSEPass());
465483
FPM.addPass(IRCEPass());
466-
FPM.addPass(InstCombinePass());
467484
FPM.addPass(JumpThreadingPass());
485+
FPM.addPass(ConstraintEliminationPass());
486+
} else if (O.getSpeedupLevel() >= 1) {
487+
JULIA_PASS(FPM.addPass(AllocOptPass()));
488+
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
489+
FPM.addPass(MemCpyOptPass());
490+
FPM.addPass(SCCPPass());
491+
FPM.addPass(BDCEPass());
492+
FPM.addPass(InstCombinePass());
493+
FPM.addPass(ADCEPass());
468494
}
469495
if (O.getSpeedupLevel() >= 3) {
470496
FPM.addPass(GVNPass());
@@ -476,12 +502,15 @@ static void buildScalarOptimizerPipeline(FunctionPassManager &FPM, PassBuilder *
476502
JULIA_PASS(FPM.addPass(AllocOptPass()));
477503
{
478504
LoopPassManager LPM;
479-
LPM.addPass(LoopDeletionPass());
480-
LPM.addPass(LoopInstSimplifyPass());
481-
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
505+
LPM.addPass(LICMPass(LICMOptions()));
506+
LPM.addPass(JuliaLICMPass());
507+
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA = */true));
482508
}
483-
FPM.addPass(LoopDistributePass());
484-
}
509+
FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions()));
510+
FPM.addPass(InstCombinePass());
511+
} else if (O.getSpeedupLevel() >= 1)
512+
FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions()));
513+
485514
invokeScalarOptimizerCallbacks(FPM, PB, O);
486515
}
487516
FPM.addPass(AfterScalarOptimizationMarkerPass());
@@ -491,19 +520,27 @@ static void buildVectorPipeline(FunctionPassManager &FPM, PassBuilder *PB, Optim
491520
FPM.addPass(BeforeVectorizationMarkerPass());
492521
if (options.enable_vector_pipeline) {
493522
//TODO look into loop vectorize options
523+
// Rerotate loops that might have been unrotated in the simplification
524+
LoopPassManager LPM;
525+
LPM.addPass(LoopRotatePass());
526+
LPM.addPass(LoopDeletionPass());
527+
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
528+
FPM.addPass(LoopDistributePass());
494529
FPM.addPass(InjectTLIMappings());
495530
FPM.addPass(LoopVectorizePass());
496531
FPM.addPass(LoopLoadEliminationPass());
497-
FPM.addPass(InstCombinePass());
498532
FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions()));
533+
FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
534+
FPM.addPass(EarlyCSEPass());
535+
FPM.addPass(CorrelatedValuePropagationPass());
536+
FPM.addPass(InstCombinePass());
499537
FPM.addPass(SLPVectorizerPass());
500-
invokeVectorizerCallbacks(FPM, PB, O);
501538
FPM.addPass(VectorCombinePass());
502-
FPM.addPass(ADCEPass());
503-
//TODO add BDCEPass here?
504-
// This unroll will unroll vectorized loops
505-
// as well as loops that we tried but failed to vectorize
539+
invokeVectorizerCallbacks(FPM, PB, O);
506540
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(O.getSpeedupLevel(), /*OnlyWhenForced = */ false, /*ForgetSCEV = */false)));
541+
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
542+
FPM.addPass(InstSimplifyPass());
543+
FPM.addPass(AfterVectorizationMarkerPass());
507544
}
508545
FPM.addPass(AfterVectorizationMarkerPass());
509546
}
@@ -525,18 +562,18 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder *
525562
FunctionPassManager FPM;
526563
JULIA_PASS(FPM.addPass(LateLowerGCPass()));
527564
JULIA_PASS(FPM.addPass(FinalLowerGCPass()));
528-
if (O.getSpeedupLevel() >= 2) {
529-
FPM.addPass(DSEPass());
530-
FPM.addPass(GVNPass());
531-
FPM.addPass(SCCPPass());
532-
FPM.addPass(DCEPass());
533-
}
534565
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
535566
}
536567
JULIA_PASS(MPM.addPass(LowerPTLSPass(options.dump_native)));
537568
MPM.addPass(RemoveJuliaAddrspacesPass()); //TODO: Make this conditional on arches (GlobalISel doesn't like our addrsspaces)
538569
if (O.getSpeedupLevel() >= 1) {
539570
FunctionPassManager FPM;
571+
if (O.getSpeedupLevel() >= 2) {
572+
FPM.addPass(DSEPass());
573+
FPM.addPass(GVNPass());
574+
FPM.addPass(SCCPPass());
575+
FPM.addPass(DCEPass());
576+
}
540577
FPM.addPass(InstCombinePass());
541578
FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions()));
542579
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));

test/boundscheck_exec.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ end |> only === Type{Int}
299299

300300
if bc_opt == bc_default
301301
# Array/Memory escape analysis
302-
function no_allocate(T::Type{<:Union{Memory, Vector}})
302+
function no_allocate(T::Type{<:Union{Memory}})
303303
v = T(undef, 2)
304304
v[1] = 2
305305
v[2] = 3
@@ -308,7 +308,7 @@ if bc_opt == bc_default
308308
function test_alloc(::Type{T}; broken=false) where T
309309
@test (@allocated no_allocate(T)) == 0 broken=broken
310310
end
311-
for T in [Memory, Vector]
311+
for T in [Memory] # This requires changing the pointer_from_objref to something llvm sees through
312312
for ET in [Int, Float32, Union{Int, Float64}]
313313
no_allocate(T{ET}) #compile
314314
# allocations aren't removed for Union eltypes which they theoretically could be eventually

test/llvmpasses/image-codegen.jl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
# CHECK-NOT: private global
1515
# CHECK: jl_global
1616
# COM: we emit both declarations and definitions, so we may see either style in the IR
17-
# CHECK-SAME: = {{(external )?}}global
18-
# CHECK: julia_f_
17+
# CHECK-SAME: = {{(external )?}}
1918
# CHECK-NOT: internal global
2019
# CHECK-NOT: private global
2120

test/llvmpasses/late-lower-gc.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,4 @@ define void @decayar([2 x {} addrspace(10)* addrspace(11)*] %ar) {
207207
; CHECK-NEXT: !10 = distinct !{!10}
208208
; CHECK-NEXT: !11 = !{!12, !12, i64 0}
209209
; CHECK-NEXT: !12 = !{!"jtbaa_const", !3}
210+

test/llvmpasses/pipeline-o2.jl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,18 @@ function loopedlength(arr)
131131
end
132132
len
133133
end
134+
# COM: Vector
135+
# ALL-LABEL: @julia_memset_like
136+
# ALL: vector.body
137+
138+
# COM: Memory
139+
# ALL-LABEL: @julia_memset_like
140+
# ALL: vector.body
141+
function memset_like(mem)
142+
for idx in eachindex(mem)
143+
mem[idx] = 1.0
144+
end
145+
end
134146

135147
emit(iterate_read, Vector{Int64})
136148
emit(iterate_write, Vector{Int64}, Vector{Int64})
@@ -150,3 +162,6 @@ emit(sumloop, Int64)
150162
emit(simd_sumloop, Float32)
151163

152164
emit(loopedlength, Vector{Int64})
165+
166+
emit(memset_like, Vector{Float64})
167+
emit(memset_like, Memory{Float64})

test/llvmpasses/pipeline-prints.ll

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -285,25 +285,18 @@ attributes #2 = { inaccessiblemem_or_argmemonly }
285285

286286
; COM: InstSimplify/InstCombine should kill this zext-trunc pair
287287
; AFTEREARLYSIMPLIFICATION: [[ZEXT:%.*]] = zext i1 {{%.*}} to i8
288-
; AFTEREARLYSIMPLIFICATION-NEXT: trunc i8 [[ZEXT]] to i1
289-
290-
; BEFOREEARLYOPTIMIZATION: [[ZEXT:%.*]] = zext i1 {{%.*}} to i8
291-
; BEFOREEARLYOPTIMIZATION-NEXT: trunc i8 [[ZEXT]] to i1
292288

293289
; AFTEREARLYOPTIMIZATION-NOT: zext i1 {{%.*}} to i8
294-
; AFTEREARLYOPTIMIZATION-NOT: trunc i8 {{%.*}} to i1
295290

296291
; BEFORELOOPOPTIMIZATION-NOT: zext i1 {{%.*}} to i8
297-
; BEFORELOOPOPTIMIZATION-NOT: trunc i8 {{%.*}} to i1
298292

299293
; COM: Loop simplification makes the exit condition obvious
300294
; AFTERLOOPSIMPLIFICATION: L35.lr.ph:
301295
; AFTERLOOPSIMPLIFICATION: add nuw nsw
302296

303-
; COM: Scalar optimization removes the previous add from the preheader
304-
; AFTERSCALAROPTIMIZATION: L35.lr.ph:
305-
; AFTERSCALAROPTIMIZATION-NOT: add nuw nsw
306-
; AFTERSCALAROPTIMIZATION: br label %L35
297+
; COM: Scalar optimization removes the preheader
298+
; AFTERSCALAROPTIMIZATION: L17:
299+
; AFTERSCALAROPTIMIZATION: icmp eq i64 {{%.*}}, 1,
307300

308301
; COM: Vectorization does stuff
309302
; AFTERVECTORIZATION: vector.body

0 commit comments

Comments
 (0)