Commit 8fd9209

Merge pull request #2061 from cfallin/aarch64-amode
Aarch64 codegen quality: support more general add+extend address computations.
2 parents 9b340f2 + f9b98f0
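This change teaches the AArch64 backend to fold address computations of the form base + sign/zero-extended 32-bit index + constant, reached through arbitrarily nested `iadd`s, into a single memory access where possible, rather than emitting separate extend and add instructions. For illustration, here is a minimal standalone sketch (toy `Expr`/`Extend` types and register numbers, not the Cranelift API) of the decomposition that the new `collect_address_addends` helper in the diff below performs:

    // Toy model of the addend split: walk a tree of 64-bit adds, peel off
    // 32->64-bit extends and constants, and return the remaining pieces.
    enum Expr {
        Reg64(u8),                 // a 64-bit value already in a register
        Uextend32(u8),             // zero-extend of a 32-bit register
        Sextend32(u8),             // sign-extend of a 32-bit register
        Const(i64),                // a constant addend
        Add(Box<Expr>, Box<Expr>), // a 64-bit add of two sub-expressions
    }

    #[derive(Debug, Clone, Copy)]
    enum Extend { Uxtw, Sxtw }

    /// Split an address expression into 64-bit addends, 32-bit extended
    /// addends, and a constant offset, mirroring the worklist in the PR.
    fn collect(root: Expr) -> (Vec<u8>, Vec<(u8, Extend)>, i64) {
        let (mut r64, mut r32, mut off) = (Vec::new(), Vec::new(), 0i64);
        let mut work = vec![root];
        while let Some(e) = work.pop() {
            match e {
                Expr::Add(a, b) => { work.push(*a); work.push(*b); }
                Expr::Uextend32(r) => r32.push((r, Extend::Uxtw)),
                Expr::Sextend32(r) => r32.push((r, Extend::Sxtw)),
                Expr::Const(c) => off += c,
                Expr::Reg64(r) => r64.push(r),
            }
        }
        (r64, r32, off)
    }

    fn main() {
        // base + uextend(index) + 16, as it might appear in CLIF.
        let addr = Expr::Add(
            Box::new(Expr::Add(Box::new(Expr::Reg64(1)), Box::new(Expr::Uextend32(2)))),
            Box::new(Expr::Const(16)),
        );
        // One 64-bit addend, one zero-extended 32-bit addend, offset 16: the
        // shape that can become a single reg + extended-reg access.
        println!("{:?}", collect(addr)); // ([1], [(2, Uxtw)], 16)
    }

With the pieces split this way, `lower_address` (see the diff below) can pick the strongest AArch64 addressing mode first and only synthesize adds for whatever is left over.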

File tree: 5 files changed (+441, -82 lines)

cranelift/codegen/src/isa/aarch64/lower.rs

Lines changed: 218 additions & 72 deletions
@@ -2,9 +2,8 @@
 //!
 //! TODO: opportunities for better code generation:
 //!
-//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns; recognize
-//!   and incorporate sign/zero extension on indices. Recognize pre/post-index
-//!   opportunities.
+//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns. Recognize
+//!   pre/post-index opportunities.
 //!
 //! - Floating-point immediates (FIMM instruction).
@@ -21,8 +20,9 @@ use crate::isa::aarch64::AArch64Backend;
 
 use super::lower_inst;
 
-use log::debug;
+use log::{debug, trace};
 use regalloc::{Reg, RegClass, Writable};
+use smallvec::SmallVec;
 
 //============================================================================
 // Result enum types.
@@ -573,105 +573,251 @@ pub(crate) fn alu_inst_immshift(
 // Lowering: addressing mode support. Takes instruction directly, rather
 // than an `InsnInput`, to do more introspection.
 
+/// 32-bit addends that make up an address: an input, and an extension mode on that
+/// input.
+type AddressAddend32List = SmallVec<[(Reg, ExtendOp); 4]>;
+/// 64-bit addends that make up an address: just an input.
+type AddressAddend64List = SmallVec<[Reg; 4]>;
+
+/// Collect all addends that feed into an address computation, with extend-modes
+/// on each. Note that a load/store may have multiple address components (and
+/// the CLIF semantics are that these components are added to form the final
+/// address), but sometimes the CLIF that we receive still has arguments that
+/// refer to `iadd` instructions. We also want to handle uextend/sextend below
+/// the add(s).
+///
+/// We match any 64-bit add (and descend into its inputs), and we match any
+/// 32-to-64-bit sign or zero extension. The returned addend-list will use
+/// NarrowValueMode values to indicate how to extend each input:
+///
+/// - NarrowValueMode::None: the associated input is 64 bits wide; no extend.
+/// - NarrowValueMode::SignExtend64: the associated input is 32 bits wide;
+///   do a sign-extension.
+/// - NarrowValueMode::ZeroExtend64: the associated input is 32 bits wide;
+///   do a zero-extension.
+///
+/// We do not descend further into the inputs of extensions, because supporting
+/// (e.g.) a 32-bit add that is later extended would require additional masking
+/// of high-order bits, which is too complex. So, in essence, we descend any
+/// number of adds from the roots, collecting all 64-bit address addends; then
+/// possibly support extensions at these leaves.
+fn collect_address_addends<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    roots: &[InsnInput],
+) -> (AddressAddend64List, AddressAddend32List, i64) {
+    let mut result32: AddressAddend32List = SmallVec::new();
+    let mut result64: AddressAddend64List = SmallVec::new();
+    let mut offset: i64 = 0;
+
+    let mut workqueue: SmallVec<[InsnInput; 4]> = roots.iter().cloned().collect();
+
+    while let Some(input) = workqueue.pop() {
+        debug_assert!(ty_bits(ctx.input_ty(input.insn, input.input)) == 64);
+        if let Some((op, insn)) = maybe_input_insn_multi(
+            ctx,
+            input,
+            &[
+                Opcode::Uextend,
+                Opcode::Sextend,
+                Opcode::Iadd,
+                Opcode::Iconst,
+            ],
+        ) {
+            match op {
+                Opcode::Uextend | Opcode::Sextend if ty_bits(ctx.input_ty(insn, 0)) == 32 => {
+                    let extendop = if op == Opcode::Uextend {
+                        ExtendOp::UXTW
+                    } else {
+                        ExtendOp::SXTW
+                    };
+                    let extendee_input = InsnInput { insn, input: 0 };
+                    let reg = put_input_in_reg(ctx, extendee_input, NarrowValueMode::None);
+                    result32.push((reg, extendop));
+                }
+                Opcode::Uextend | Opcode::Sextend => {
+                    let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
+                    result64.push(reg);
+                }
+                Opcode::Iadd => {
+                    for input in 0..ctx.num_inputs(insn) {
+                        let addend = InsnInput { insn, input };
+                        workqueue.push(addend);
+                    }
+                }
+                Opcode::Iconst => {
+                    let value: i64 = ctx.get_constant(insn).unwrap() as i64;
+                    offset += value;
+                }
+                _ => panic!("Unexpected opcode from maybe_input_insn_multi"),
+            }
+        } else {
+            let reg = put_input_in_reg(ctx, input, NarrowValueMode::ZeroExtend64);
+            result64.push(reg);
+        }
+    }
+
+    (result64, result32, offset)
+}
+
 /// Lower the address of a load or store.
 pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     elem_ty: Type,
-    addends: &[InsnInput],
+    roots: &[InsnInput],
     offset: i32,
 ) -> MemArg {
     // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or
     // mul instructions (Load/StoreComplex don't include scale factors).
 
-    // Handle one reg and offset.
-    if addends.len() == 1 {
-        let reg = put_input_in_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64);
-        return MemArg::RegOffset(reg, offset as i64, elem_ty);
-    }
+    // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
+    // extends and addition ops. We update these as we consume address
+    // components, so they represent the remaining addends not yet handled.
+    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
+    let mut offset = args_offset + (offset as i64);
+
+    trace!(
+        "lower_address: addends64 {:?}, addends32 {:?}, offset {}",
+        addends64,
+        addends32,
+        offset
+    );
 
-    // Handle two regs and a zero offset with built-in extend, if possible.
-    if addends.len() == 2 && offset == 0 {
-        // r1, r2 (to be extended), r2_bits, is_signed
-        let mut parts: Option<(Reg, Reg, usize, bool)> = None;
-        // Handle extension of either first or second addend.
-        for i in 0..2 {
-            if let Some((op, ext_insn)) =
-                maybe_input_insn_multi(ctx, addends[i], &[Opcode::Uextend, Opcode::Sextend])
-            {
-                // Non-extended addend.
-                let r1 = put_input_in_reg(ctx, addends[1 - i], NarrowValueMode::ZeroExtend64);
-                // Extended addend.
-                let r2 = put_input_in_reg(
-                    ctx,
-                    InsnInput {
-                        insn: ext_insn,
-                        input: 0,
-                    },
-                    NarrowValueMode::None,
-                );
-                let r2_bits = ty_bits(ctx.input_ty(ext_insn, 0));
-                parts = Some((
-                    r1,
-                    r2,
-                    r2_bits,
-                    /* is_signed = */ op == Opcode::Sextend,
-                ));
-                break;
-            }
+    // First, decide what the `MemArg` will be. Take one extendee and one 64-bit
+    // reg, or two 64-bit regs, or a 64-bit reg and a 32-bit reg with extension,
+    // or some other combination as appropriate.
+    let memarg = if addends64.len() > 0 {
+        if addends32.len() > 0 {
+            let (reg32, extendop) = addends32.pop().unwrap();
+            let reg64 = addends64.pop().unwrap();
+            MemArg::RegExtended(reg64, reg32, extendop)
+        } else if offset > 0 && offset < 0x1000 {
+            let reg64 = addends64.pop().unwrap();
+            let off = offset;
+            offset = 0;
+            MemArg::RegOffset(reg64, off, elem_ty)
+        } else if addends64.len() >= 2 {
+            let reg1 = addends64.pop().unwrap();
+            let reg2 = addends64.pop().unwrap();
+            MemArg::RegReg(reg1, reg2)
+        } else {
+            let reg1 = addends64.pop().unwrap();
+            MemArg::reg(reg1)
         }
-
-        if let Some((r1, r2, r2_bits, is_signed)) = parts {
-            match (r2_bits, is_signed) {
-                (32, false) => {
-                    return MemArg::RegExtended(r1, r2, ExtendOp::UXTW);
-                }
-                (32, true) => {
-                    return MemArg::RegExtended(r1, r2, ExtendOp::SXTW);
-                }
-                _ => {}
+    } else
+    /* addends64.len() == 0 */
+    {
+        if addends32.len() > 0 {
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+            let (reg1, extendop) = addends32.pop().unwrap();
+            let signed = match extendop {
+                ExtendOp::SXTW => true,
+                ExtendOp::UXTW => false,
+                _ => unreachable!(),
+            };
+            ctx.emit(Inst::Extend {
+                rd: tmp,
+                rn: reg1,
+                signed,
+                from_bits: 32,
+                to_bits: 64,
+            });
+            if let Some((reg2, extendop)) = addends32.pop() {
+                MemArg::RegExtended(tmp.to_reg(), reg2, extendop)
+            } else {
+                MemArg::reg(tmp.to_reg())
             }
+        } else
+        /* addends32.len() == 0 */
+        {
+            let off_reg = ctx.alloc_tmp(RegClass::I64, I64);
+            lower_constant_u64(ctx, off_reg, offset as u64);
+            offset = 0;
+            MemArg::reg(off_reg.to_reg())
         }
-    }
+    };
 
-    // Handle two regs and a zero offset in the general case, if possible.
-    if addends.len() == 2 && offset == 0 {
-        let ra = put_input_in_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64);
-        let rb = put_input_in_reg(ctx, addends[1], NarrowValueMode::ZeroExtend64);
-        return MemArg::reg_plus_reg(ra, rb);
+    // At this point, if we have any remaining components, we need to allocate a
+    // temp, replace one of the registers in the MemArg with the temp, and emit
+    // instructions to add together the remaining components. Return immediately
+    // if this is *not* the case.
+    if offset == 0 && addends32.len() == 0 && addends64.len() == 0 {
+        return memarg;
    }
 
-    // Otherwise, generate add instructions.
+    // Allocate the temp and shoehorn it into the MemArg.
     let addr = ctx.alloc_tmp(RegClass::I64, I64);
+    let (reg, memarg) = match memarg {
+        MemArg::RegExtended(r1, r2, extendop) => {
+            (r1, MemArg::RegExtended(addr.to_reg(), r2, extendop))
+        }
+        MemArg::RegOffset(r, off, ty) => (r, MemArg::RegOffset(addr.to_reg(), off, ty)),
+        MemArg::RegReg(r1, r2) => (r2, MemArg::RegReg(addr.to_reg(), r1)),
+        MemArg::UnsignedOffset(r, imm) => (r, MemArg::UnsignedOffset(addr.to_reg(), imm)),
+        _ => unreachable!(),
+    };
 
-    // Get the const into a reg.
-    lower_constant_u64(ctx, addr.clone(), offset as u64);
-
-    // Add each addend to the address.
-    for addend in addends {
-        let reg = put_input_in_reg(ctx, *addend, NarrowValueMode::ZeroExtend64);
+    // If there is any offset, load that first into `addr`, and add the `reg`
+    // that we kicked out of the `MemArg`; otherwise, start with that reg.
+    if offset != 0 {
+        // If we can fit offset or -offset in an imm12, use an add-imm
+        // to combine the reg and offset. Otherwise, load value first then add.
+        if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) {
+            ctx.emit(Inst::AluRRImm12 {
+                alu_op: ALUOp::Add64,
+                rd: addr,
+                rn: reg,
+                imm12,
+            });
+        } else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) {
+            ctx.emit(Inst::AluRRImm12 {
+                alu_op: ALUOp::Sub64,
+                rd: addr,
+                rn: reg,
+                imm12,
+            });
+        } else {
+            lower_constant_u64(ctx, addr, offset as u64);
+            ctx.emit(Inst::AluRRR {
+                alu_op: ALUOp::Add64,
+                rd: addr,
+                rn: addr.to_reg(),
+                rm: reg,
+            });
+        }
+    } else {
+        ctx.emit(Inst::gen_move(addr, reg, I64));
+    }
 
-        // In an addition, the stack register is the zero register, so divert it to another
-        // register just before doing the actual add.
+    // Now handle reg64 and reg32-extended components.
+    for reg in addends64 {
+        // If the register is the stack reg, we must move it to another reg
+        // before adding it.
         let reg = if reg == stack_reg() {
             let tmp = ctx.alloc_tmp(RegClass::I64, I64);
-            ctx.emit(Inst::Mov {
-                rd: tmp,
-                rm: stack_reg(),
-            });
+            ctx.emit(Inst::gen_move(tmp, stack_reg(), I64));
             tmp.to_reg()
         } else {
             reg
        };
-
        ctx.emit(Inst::AluRRR {
            alu_op: ALUOp::Add64,
-            rd: addr.clone(),
+            rd: addr,
+            rn: addr.to_reg(),
+            rm: reg,
+        });
+    }
+    for (reg, extendop) in addends32 {
+        assert!(reg != stack_reg());
+        ctx.emit(Inst::AluRRRExtend {
+            alu_op: ALUOp::Add64,
+            rd: addr,
            rn: addr.to_reg(),
-            rm: reg.clone(),
+            rm: reg,
+            extendop,
        });
    }
 
-    MemArg::reg(addr.to_reg())
+    memarg
 }
 
 pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
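To summarize the selection order the rewritten `lower_address` uses above, here is a hedged, self-contained sketch with simplified stand-ins for `MemArg` (the real enum carries a type and more cases): prefer a base register plus an extended 32-bit index, then a base plus a small positive offset (< 0x1000), then two 64-bit registers, then a lone register. Anything still left in `addends64`/`addends32`, or a non-zero leftover offset, is then folded into a temporary with `Add64`/`AluRRRExtend` instructions.

    // Toy sketch of the addressing-mode choice, not the Cranelift MemArg API.
    #[derive(Debug, Clone, Copy)]
    enum Ext { Uxtw, Sxtw }

    #[derive(Debug)]
    enum AMode {
        RegExtended(u8, u8, Ext), // 64-bit base + 32-bit index, extended
        RegOffset(u8, i64),       // 64-bit base + immediate offset
        RegReg(u8, u8),           // two 64-bit registers
        Reg(u8),                  // single 64-bit register
    }

    fn choose(addends64: &mut Vec<u8>, addends32: &mut Vec<(u8, Ext)>, offset: &mut i64) -> AMode {
        if let Some(reg64) = addends64.pop() {
            if let Some((reg32, ext)) = addends32.pop() {
                AMode::RegExtended(reg64, reg32, ext)
            } else if *offset > 0 && *offset < 0x1000 {
                AMode::RegOffset(reg64, std::mem::replace(offset, 0))
            } else if let Some(reg2) = addends64.pop() {
                AMode::RegReg(reg64, reg2)
            } else {
                AMode::Reg(reg64)
            }
        } else {
            // With no 64-bit addends, the real code extends a 32-bit addend
            // into a temp or materializes the constant; elided in this sketch.
            unimplemented!("see lower_address in the diff above")
        }
    }

    fn main() {
        let (mut a64, mut a32, mut off) = (vec![1], vec![(2, Ext::Uxtw)], 0i64);
        println!("{:?}", choose(&mut a64, &mut a32, &mut off)); // RegExtended(1, 2, Uxtw)
    }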

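The leftover-offset path relies on the AArch64 ADD/SUB (immediate) form, a 12-bit value optionally shifted left by 12; the code tries `offset` and then `-offset` against that encoding before falling back to materializing the constant and doing a register-register add. A small sketch of that fit check (an assumption about what `Imm12::maybe_from_u64` accepts, not its actual implementation):

    /// Returns Some((imm12, shift12)) if `val` is encodable as an ADD/SUB immediate.
    fn maybe_imm12(val: u64) -> Option<(u16, bool)> {
        if val < 0x1000 {
            Some((val as u16, false))        // fits in the low 12 bits
        } else if val & 0xfff == 0 && (val >> 12) < 0x1000 {
            Some(((val >> 12) as u16, true)) // fits when shifted left by 12
        } else {
            None
        }
    }

    fn main() {
        assert_eq!(maybe_imm12(40), Some((40, false)));     // add xD, xN, #40
        assert_eq!(maybe_imm12(0x5000), Some((0x5, true))); // add xD, xN, #0x5, lsl #12
        assert_eq!(maybe_imm12(0x123456), None);            // needs a constant load + add
    }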