|
1 | 1 | # LLVM IR optimization
|
2 | 2 |
|
3 |
| -function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=2) |
4 |
| - optimize_newpm!(job, mod; opt_level) |
5 |
| - # TODO: clean up |
6 |
| - return |
7 |
| -end |
8 |
| - |
9 |
| - |
10 |
| -## new pm |
11 |
| - |
12 |
| -function optimize_newpm!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level) |
| 3 | +function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=1) |
13 | 4 | tm = llvm_machine(job.config.target)
|
14 | 5 |
|
15 | 6 | global current_job
|
@@ -292,279 +283,6 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
|
292 | 283 | end
|
293 | 284 |
|
294 | 285 |
|
295 |
| -## legacy pm |
296 |
| - |
297 |
| -function optimize_legacypm!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level) |
298 |
| - triple = llvm_triple(job.config.target) |
299 |
| - tm = llvm_machine(job.config.target) |
300 |
| - |
301 |
| - global current_job |
302 |
| - current_job = job |
303 |
| - |
304 |
| - @dispose pm=ModulePassManager() begin |
305 |
| - addTargetPasses!(pm, tm, triple) |
306 |
| - addOptimizationPasses!(pm, opt_level) |
307 |
| - run!(pm, mod) |
308 |
| - end |
309 |
| - |
310 |
| - # NOTE: we need to use multiple distinct pass managers to force pass ordering; |
311 |
| - # intrinsics should never get lowered before Julia has optimized them. |
312 |
| - # XXX: why doesn't the barrier noop pass work here? |
313 |
| - |
314 |
| - # lower intrinsics |
315 |
| - @dispose pm=ModulePassManager() begin |
316 |
| - addTargetPasses!(pm, tm, triple) |
317 |
| - |
318 |
| - if !uses_julia_runtime(job) |
319 |
| - lower_gc_frame!(pm) |
320 |
| - end |
321 |
| - |
322 |
| - if job.config.kernel |
323 |
| - # GC lowering is the last pass that may introduce calls to the runtime library, |
324 |
| - # and thus additional uses of the kernel state intrinsic. |
325 |
| - # TODO: now that all kernel state-related passes are being run here, merge some? |
326 |
| - add_kernel_state!(pm) |
327 |
| - lower_kernel_state!(pm) |
328 |
| - cleanup_kernel_state!(pm) |
329 |
| - end |
330 |
| - |
331 |
| - if !uses_julia_runtime(job) |
332 |
| - # remove dead uses of ptls |
333 |
| - aggressive_dce!(pm) |
334 |
| - lower_ptls!(pm) |
335 |
| - end |
336 |
| - |
337 |
| - if uses_julia_runtime(job) |
338 |
| - lower_exc_handlers!(pm) |
339 |
| - end |
340 |
| - # the Julia GC lowering pass also has some clean-up that is required |
341 |
| - late_lower_gc_frame!(pm) |
342 |
| - if uses_julia_runtime(job) |
343 |
| - final_lower_gc!(pm) |
344 |
| - end |
345 |
| - |
346 |
| - remove_ni!(pm) |
347 |
| - remove_julia_addrspaces!(pm) |
348 |
| - |
349 |
| - if uses_julia_runtime(job) |
350 |
| - # We need these two passes and the instcombine below |
351 |
| - # after GC lowering to let LLVM do some constant propagation on the tags. |
352 |
| - # and remove some unnecessary write barrier checks. |
353 |
| - gvn!(pm) |
354 |
| - sccp!(pm) |
355 |
| - # Remove dead use of ptls |
356 |
| - dce!(pm) |
357 |
| - LLVM.Interop.lower_ptls!(pm, dump_native(job)) |
358 |
| - instruction_combining!(pm) |
359 |
| - # Clean up write barrier and ptls lowering |
360 |
| - cfgsimplification!(pm) |
361 |
| - end |
362 |
| - |
363 |
| - # Julia's operand bundles confuse the inliner, so repeat here now they are gone. |
364 |
| - # FIXME: we should fix the inliner so that inlined code gets optimized early-on |
365 |
| - always_inliner!(pm) |
366 |
| - |
367 |
| - # some of Julia's optimization passes happen _after_ lowering intrinsics |
368 |
| - combine_mul_add!(pm) |
369 |
| - div_rem_pairs!(pm) |
370 |
| - |
371 |
| - run!(pm, mod) |
372 |
| - end |
373 |
| - |
374 |
| - # target-specific optimizations |
375 |
| - optimize_module!(job, mod) |
376 |
| - |
377 |
| - # we compile a module containing the entire call graph, |
378 |
| - # so perform some interprocedural optimizations. |
379 |
| - # |
380 |
| - # for some reason, these passes need to be distinct from the regular optimization chain, |
381 |
| - # or certain values (such as the constant arrays used to populate llvm.compiler.used as |
382 |
| - # part of the LateLowerGCFrame pass) aren't collected properly. |
383 |
| - # |
384 |
| - # these might not always be safe, as Julia's IR metadata isn't designed for IPO. |
385 |
| - @dispose pm=ModulePassManager() begin |
386 |
| - addTargetPasses!(pm, tm, triple) |
387 |
| - |
388 |
| - # simplify function calls that don't use the returned value |
389 |
| - dead_arg_elimination!(pm) |
390 |
| - |
391 |
| - run!(pm, mod) |
392 |
| - end |
393 |
| - |
394 |
| - return |
395 |
| -end |
396 |
| - |
397 |
| -function addTargetPasses!(pm, tm, triple) |
398 |
| - add_library_info!(pm, triple) |
399 |
| - add_transform_info!(pm, tm) |
400 |
| -end |
401 |
| - |
402 |
| -# Based on Julia's optimization pipeline, minus the SLP and loop vectorizers. |
403 |
| -function addOptimizationPasses!(pm, opt_level) |
404 |
| - # compare with using Julia's optimization pipeline directly: |
405 |
| - #ccall(:jl_add_optimization_passes, Cvoid, |
406 |
| - # (LLVM.API.LLVMPassManagerRef, Cint, Cint), |
407 |
| - # pm, opt_level, #=lower_intrinsics=# 0) |
408 |
| - #return |
409 |
| - |
410 |
| - # NOTE: LLVM 12 disabled the hoisting of common instructions |
411 |
| - # before loop vectorization (https://reviews.llvm.org/D84108). |
412 |
| - # |
413 |
| - # This is re-enabled with calls to cfg_simplify here, |
414 |
| - # to merge allocations and sometimes eliminate them, |
415 |
| - # since AllocOpt does not handle PhiNodes. |
416 |
| - # Enable this instruction hoisting because of this and Union benchmarks. |
417 |
| - |
418 |
| - constant_merge!(pm) |
419 |
| - |
420 |
| - if opt_level < 2 |
421 |
| - cpu_features!(pm) |
422 |
| - if opt_level == 1 |
423 |
| - instruction_simplify!(pm) |
424 |
| - end |
425 |
| - if LLVM.version() >= v"12" |
426 |
| - cfgsimplification!(pm; hoist_common_insts=true) |
427 |
| - else |
428 |
| - cfgsimplification!(pm) |
429 |
| - end |
430 |
| - if opt_level == 1 |
431 |
| - scalar_repl_aggregates!(pm) |
432 |
| - instruction_combining!(pm) |
433 |
| - early_cse!(pm) |
434 |
| - # maybe add GVN? |
435 |
| - # also try GVNHoist and GVNSink |
436 |
| - end |
437 |
| - mem_cpy_opt!(pm) |
438 |
| - always_inliner!(pm) # Respect always_inline |
439 |
| - lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop |
440 |
| - return |
441 |
| - end |
442 |
| - |
443 |
| - propagate_julia_addrsp!(pm) |
444 |
| - scoped_no_alias_aa!(pm) |
445 |
| - type_based_alias_analysis!(pm) |
446 |
| - if opt_level >= 3 |
447 |
| - basic_alias_analysis!(pm) |
448 |
| - end |
449 |
| - if LLVM.version() >= v"12" |
450 |
| - cfgsimplification!(pm; hoist_common_insts=true) |
451 |
| - else |
452 |
| - cfgsimplification!(pm) |
453 |
| - end |
454 |
| - dce!(pm) |
455 |
| - scalar_repl_aggregates!(pm) |
456 |
| - |
457 |
| - #mem_cpy_opt!(pm) |
458 |
| - |
459 |
| - always_inliner!(pm) # Respect always_inline |
460 |
| - |
461 |
| - # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard |
462 |
| - # time merging the `alloca` for the unboxed data and the `alloca` created by |
463 |
| - # the `alloc_opt` pass. |
464 |
| - |
465 |
| - alloc_opt!(pm) |
466 |
| - # consider AggressiveInstCombinePass at optlevel > 2 |
467 |
| - instruction_combining!(pm) |
468 |
| - if LLVM.version() >= v"12" |
469 |
| - cfgsimplification!(pm; hoist_common_insts=true) |
470 |
| - else |
471 |
| - cfgsimplification!(pm) |
472 |
| - end |
473 |
| - cpu_features!(pm) |
474 |
| - scalar_repl_aggregates!(pm) |
475 |
| - # SROA can duplicate PHI nodes which can block LowerSIMD |
476 |
| - instruction_combining!(pm) |
477 |
| - jump_threading!(pm) |
478 |
| - correlated_value_propagation!(pm) |
479 |
| - |
480 |
| - reassociate!(pm) |
481 |
| - |
482 |
| - early_cse!(pm) |
483 |
| - |
484 |
| - # Load forwarding above can expose allocations that aren't actually used |
485 |
| - # remove those before optimizing loops. |
486 |
| - alloc_opt!(pm) |
487 |
| - loop_rotate!(pm) |
488 |
| - # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) |
489 |
| - |
490 |
| - # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards |
491 |
| - lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop |
492 |
| - licm!(pm) |
493 |
| - julia_licm!(pm) |
494 |
| - if LLVM.version() >= v"15" |
495 |
| - simple_loop_unswitch_legacy!(pm) |
496 |
| - else |
497 |
| - # XXX: simple loop unswitch is available on older versions of LLVM too, |
498 |
| - # but using this pass instead of the old one breaks Metal.jl. |
499 |
| - loop_unswitch!(pm) |
500 |
| - end |
501 |
| - licm!(pm) |
502 |
| - julia_licm!(pm) |
503 |
| - inductive_range_check_elimination!(pm) |
504 |
| - # Subsequent passes not stripping metadata from terminator |
505 |
| - instruction_simplify!(pm) |
506 |
| - loop_idiom!(pm) |
507 |
| - ind_var_simplify!(pm) |
508 |
| - loop_deletion!(pm) |
509 |
| - loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll |
510 |
| - |
511 |
| - # Run our own SROA on heap objects before LLVM's |
512 |
| - alloc_opt!(pm) |
513 |
| - # Re-run SROA after loop-unrolling (useful for small loops that operate |
514 |
| - # over the structure of an aggregate) |
515 |
| - scalar_repl_aggregates!(pm) |
516 |
| - # might not be necessary: |
517 |
| - instruction_simplify!(pm) |
518 |
| - |
519 |
| - gvn!(pm) |
520 |
| - mem_cpy_opt!(pm) |
521 |
| - sccp!(pm) |
522 |
| - |
523 |
| - # These next two passes must come before IRCE to eliminate the bounds check in #43308 |
524 |
| - correlated_value_propagation!(pm) |
525 |
| - dce!(pm) |
526 |
| - |
527 |
| - inductive_range_check_elimination!(pm) # Must come between the two GVN passes |
528 |
| - |
529 |
| - # Run instcombine after redundancy elimination to exploit opportunities |
530 |
| - # opened up by them. |
531 |
| - # This needs to be InstCombine instead of InstSimplify to allow |
532 |
| - # loops over Union-typed arrays to vectorize. |
533 |
| - instruction_combining!(pm) |
534 |
| - jump_threading!(pm) |
535 |
| - if opt_level >= 3 |
536 |
| - gvn!(pm) # Must come after JumpThreading and before LoopVectorize |
537 |
| - end |
538 |
| - dead_store_elimination!(pm) |
539 |
| - |
540 |
| - # More dead allocation (store) deletion before loop optimization |
541 |
| - # consider removing this: |
542 |
| - alloc_opt!(pm) |
543 |
| - # see if all of the constant folding has exposed more loops |
544 |
| - # to simplification and deletion |
545 |
| - # this helps significantly with cleaning up iteration |
546 |
| - cfgsimplification!(pm) # See note above, don't hoist instructions before LV |
547 |
| - loop_deletion!(pm) |
548 |
| - instruction_combining!(pm) |
549 |
| - loop_vectorize!(pm) |
550 |
| - loop_load_elimination!(pm) |
551 |
| - # Cleanup after LV pass |
552 |
| - instruction_combining!(pm) |
553 |
| - if LLVM.version() >= v"12" |
554 |
| - cfgsimplification!(pm; # Aggressive CFG simplification |
555 |
| - forward_switch_cond_to_phi=true, |
556 |
| - convert_switch_to_lookup_table=true, |
557 |
| - need_canonical_loop=true, |
558 |
| - hoist_common_insts=true, |
559 |
| - #sink_common_insts=true # FIXME: Causes assertion in llvm-late-lowering |
560 |
| - ) |
561 |
| - else |
562 |
| - cfgsimplification!(pm) |
563 |
| - end |
564 |
| - |
565 |
| - aggressive_dce!(pm) |
566 |
| -end |
567 |
| - |
568 | 286 |
|
569 | 287 | ## custom passes
|
570 | 288 |
|
|
0 commit comments