Skip to content

Commit e07d536

Browse files
committed
WIP return stack optimization
1 parent f9017fa commit e07d536

File tree

12 files changed

+329
-152
lines changed

12 files changed

+329
-152
lines changed

src/jit/assembler/arm/branch_assembler.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,16 @@ impl B {
2121
u4::new(cond as u8),
2222
))
2323
}
24+
25+
pub fn bl(imm: i32, cond: Cond) -> u32 {
26+
u32::from(B::new(
27+
// Extract first 24 bits, also keep msb
28+
u24::new((((imm << 8) >> 8) & 0xFFFFFF) as u32),
29+
u1::new(1),
30+
u3::new(0b101),
31+
u4::new(cond as u8),
32+
))
33+
}
2434
}
2535

2636
#[bitsize(32)]

src/jit/assembler/block_asm.rs

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use crate::jit::assembler::block_inst_list::BlockInstList;
55
use crate::jit::assembler::block_reg_set::BlockRegSet;
66
use crate::jit::assembler::{block_reg_allocator, BlockAsmBuf, BlockInst, BlockLabel, BlockOperand, BlockOperandShift, BlockReg, ANY_REG_LIMIT};
77
use crate::jit::inst_info::InstInfo;
8-
use crate::jit::reg::{Reg, RegReserve};
8+
use crate::jit::reg::{reg_reserve, Reg, RegReserve};
99
use crate::jit::{Cond, MemoryAmount, ShiftType};
1010
use crate::utils::{NoHashMap, NoHashSet};
1111

@@ -101,7 +101,7 @@ impl<'a> BlockAsm<'a> {
101101
instance.start_cond_block(Cond::NE);
102102
let host_sp_addr_reg = thread_regs_addr_reg;
103103
instance.mov(host_sp_addr_reg, host_sp_ptr as u32);
104-
instance.transfer_write(BlockReg::Fixed(Reg::SP), host_sp_addr_reg, 0, false, MemoryAmount::Word);
104+
instance.store_u32(BlockReg::Fixed(Reg::SP), host_sp_addr_reg, 0);
105105
instance.end_cond_block();
106106

107107
instance.sub(BlockReg::Fixed(Reg::SP), BlockReg::Fixed(Reg::SP), ANY_REG_LIMIT as u32 * 4); // Reserve for spilled registers
@@ -246,6 +246,30 @@ impl<'a> BlockAsm<'a> {
246246
})
247247
}
248248

249+
pub fn load_u8(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
250+
self.transfer_read(op0, op1, op2, false, MemoryAmount::Byte)
251+
}
252+
253+
pub fn store_u8(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
254+
self.transfer_write(op0, op1, op2, false, MemoryAmount::Byte)
255+
}
256+
257+
pub fn load_u16(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
258+
self.transfer_read(op0, op1, op2, false, MemoryAmount::Half)
259+
}
260+
261+
pub fn store_u16(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
262+
self.transfer_write(op0, op1, op2, false, MemoryAmount::Half)
263+
}
264+
265+
pub fn load_u32(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
266+
self.transfer_read(op0, op1, op2, false, MemoryAmount::Word)
267+
}
268+
269+
pub fn store_u32(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>) {
270+
self.transfer_write(op0, op1, op2, false, MemoryAmount::Word)
271+
}
272+
249273
pub fn transfer_read(&mut self, op0: impl Into<BlockReg>, op1: impl Into<BlockReg>, op2: impl Into<BlockOperandShift>, signed: bool, amount: MemoryAmount) {
250274
self.transfer(BlockTransferOp::Read, op0, op1, op2, signed, amount)
251275
}
@@ -347,6 +371,7 @@ impl<'a> BlockAsm<'a> {
347371
cond,
348372
block_index: 0,
349373
skip: false,
374+
has_return: false,
350375
})
351376
}
352377

@@ -378,7 +403,12 @@ impl<'a> BlockAsm<'a> {
378403
pub fn epilogue(&mut self) {
379404
let host_sp_addr_reg = self.thread_regs_addr_reg;
380405
self.mov(host_sp_addr_reg, self.host_sp_ptr as u32);
381-
self.transfer_read(BlockReg::Fixed(Reg::SP), host_sp_addr_reg, 0, false, MemoryAmount::Word);
406+
self.load_u32(BlockReg::Fixed(Reg::SP), host_sp_addr_reg, 0);
407+
self.buf.insts.push(BlockInst::Epilogue);
408+
}
409+
410+
pub fn epilogue_previous_block(&mut self) {
411+
self.add(BlockReg::Fixed(Reg::SP), BlockReg::Fixed(Reg::SP), ANY_REG_LIMIT as u32 * 4);
382412
self.buf.insts.push(BlockInst::Epilogue);
383413
}
384414

@@ -440,6 +470,9 @@ impl<'a> BlockAsm<'a> {
440470
}
441471
}
442472
self.mov(self.tmp_func_call_reg, func.into());
473+
if has_return {
474+
self.transfer_push(BlockReg::Fixed(Reg::SP), reg_reserve!(Reg::LR));
475+
}
443476
self.insert_inst(BlockInst::Call {
444477
func_reg: self.tmp_func_call_reg,
445478
args: [
@@ -450,6 +483,9 @@ impl<'a> BlockAsm<'a> {
450483
],
451484
has_return,
452485
});
486+
if has_return {
487+
self.transfer_pop(BlockReg::Fixed(Reg::SP), reg_reserve!(Reg::LR));
488+
}
453489
}
454490

455491
pub fn bkpt(&mut self, id: u16) {
@@ -471,7 +507,7 @@ impl<'a> BlockAsm<'a> {
471507
self.insert_inst(BlockInst::GuestPc(pc));
472508
}
473509

474-
pub fn guest_branch(&mut self, cond: Cond, target_pc: u32) {
510+
pub fn guest_branch(&mut self, cond: Cond, target_pc: u32, has_return: bool) {
475511
let label = match self.buf.guest_branches_mapping.get(&target_pc) {
476512
None => {
477513
let label = self.new_label();
@@ -485,6 +521,7 @@ impl<'a> BlockAsm<'a> {
485521
cond,
486522
block_index: 0,
487523
skip: false,
524+
has_return,
488525
});
489526
}
490527

@@ -850,10 +887,11 @@ impl<'a> BlockAsm<'a> {
850887
for branch_placeholder in branch_placeholders {
851888
let opcode = opcodes[branch_placeholder];
852889
let cond = Cond::from((opcode >> 28) as u8);
853-
let block_index = opcode & 0xFFFFFFF;
890+
let has_return = (opcode >> 27) & 1 == 1;
891+
let block_index = opcode & 0x7FFFFFF;
854892
let branch_to = opcodes_offset[block_index as usize];
855893
let diff = branch_to as i32 - branch_placeholder as i32;
856-
opcodes[branch_placeholder] = B::b(diff - 2, cond);
894+
opcodes[branch_placeholder] = if has_return { B::bl(diff - 2, cond) } else { B::b(diff - 2, cond) };
857895
}
858896

859897
opcodes

src/jit/assembler/block_inst.rs

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ impl BlockInst {
126126
BlockInst::Bfc { operand, .. } => (block_reg_set!(Some(*operand)), block_reg_set!(Some(*operand))),
127127
BlockInst::Bfi { operands, .. } => (block_reg_set!(Some(operands[0]), Some(operands[1])), block_reg_set!(Some(operands[0]))),
128128

129+
BlockInst::Branch { has_return, .. } => (block_reg_set!(), block_reg_set!(if *has_return { Some(BlockReg::Fixed(Reg::LR)) } else { None })),
130+
129131
BlockInst::SaveContext { .. } => (block_reg_set!(), block_reg_set!()),
130132
BlockInst::SaveReg {
131133
guest_reg,
@@ -164,7 +166,7 @@ impl BlockInst {
164166
(block_reg_set!(Some(*thread_regs_addr_reg)), outputs)
165167
}
166168

167-
BlockInst::Call { func_reg, args, .. } => {
169+
BlockInst::Call { func_reg, args, has_return } => {
168170
let mut inputs = BlockRegSet::new();
169171
inputs += *func_reg;
170172
for arg in args {
@@ -180,7 +182,8 @@ impl BlockInst {
180182
Some(BlockReg::Fixed(Reg::R2)),
181183
Some(BlockReg::Fixed(Reg::R3)),
182184
Some(BlockReg::Fixed(Reg::R12)),
183-
Some(BlockReg::Fixed(Reg::CPSR))
185+
Some(BlockReg::Fixed(Reg::CPSR)),
186+
if *has_return { Some(BlockReg::Fixed(Reg::LR)) } else { None }
184187
),
185188
)
186189
}
@@ -205,7 +208,7 @@ impl BlockInst {
205208
block_reg_set!(Some(BlockReg::Fixed(Reg::SP)), Some(BlockReg::Fixed(Reg::PC))),
206209
),
207210

208-
BlockInst::Label { .. } | BlockInst::Branch { .. } | BlockInst::GuestPc(_) | BlockInst::Bkpt(_) => (block_reg_set!(), block_reg_set!()),
211+
BlockInst::Label { .. } | BlockInst::GuestPc(_) | BlockInst::Bkpt(_) => (block_reg_set!(), block_reg_set!()),
209212
}
210213
}
211214

@@ -542,11 +545,13 @@ impl BlockInst {
542545
}
543546
},
544547

545-
BlockInst::Branch { cond, block_index, skip, .. } => {
548+
BlockInst::Branch {
549+
cond, block_index, skip, has_return, ..
550+
} => {
546551
if !*skip {
547552
// Encode label and cond as u32
548553
// Branch offset can only be figured out later
549-
opcodes.push(((*cond as u32) << 28) | (*block_index as u32));
554+
opcodes.push(((*cond as u32) << 28) | ((*has_return as u32) << 27) | (*block_index as u32));
550555
branch_placeholders.push(opcodes_offset + opcode_index);
551556
}
552557
}
@@ -717,6 +722,7 @@ pub enum BlockInst {
717722
cond: Cond,
718723
block_index: usize,
719724
skip: bool,
725+
has_return: bool,
720726
},
721727

722728
SaveContext {
@@ -814,7 +820,19 @@ impl Debug for BlockInst {
814820
};
815821
write!(f, "label {label:?} {guest_pc}:")
816822
}
817-
BlockInst::Branch { label, cond, block_index, skip } => write!(f, "B{cond:?} {label:?}, block index: {block_index}, skip: {skip}"),
823+
BlockInst::Branch {
824+
label,
825+
cond,
826+
block_index,
827+
skip,
828+
has_return,
829+
} => {
830+
if *has_return {
831+
write!(f, "Bl{cond:?} {label:?}, block index: {block_index}, skip: {skip}")
832+
} else {
833+
write!(f, "B{cond:?} {label:?}, block index: {block_index}, skip: {skip}")
834+
}
835+
}
818836
BlockInst::SaveContext { .. } => write!(f, "SaveContext"),
819837
BlockInst::SaveReg { guest_reg, reg_mapped, .. } => write!(f, "SaveReg {guest_reg:?}, mapped: {reg_mapped:?}"),
820838
BlockInst::RestoreReg { guest_reg, reg_mapped, .. } => write!(f, "RestoreReg {guest_reg:?}, mapped: {reg_mapped:?}"),

src/jit/disassembler/branch_instructions.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ mod branch_ops {
2727
Op::Blx,
2828
Operands::new_1(Operand::imm(op0 as u32)),
2929
reg_reserve!(),
30-
reg_reserve!(),
30+
reg_reserve!(Reg::LR, Reg::CPSR),
3131
1,
3232
)
3333
} else {
@@ -47,7 +47,7 @@ mod branch_ops {
4747
Op::Blx,
4848
Operands::new_1(Operand::imm(op0 as u32)),
4949
reg_reserve!(),
50-
reg_reserve!(),
50+
reg_reserve!(Reg::LR, Reg::CPSR),
5151
1,
5252
)
5353
} else {

src/jit/emitter/emit.rs

Lines changed: 32 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
use crate::core::CpuType;
22
use crate::core::CpuType::ARM7;
33
use crate::jit::assembler::block_asm::BlockAsm;
4-
use crate::jit::assembler::BlockReg;
4+
use crate::jit::assembler::{BlockLabel, BlockReg};
55
use crate::jit::inst_threag_regs_handler::{register_restore_spsr, restore_thumb_after_restore_spsr, set_pc_arm_mode};
66
use crate::jit::jit_asm::{JitAsm, JitRuntimeData};
77
use crate::jit::op::Op;
88
use crate::jit::reg::Reg;
9-
use crate::jit::{Cond, MemoryAmount};
9+
use crate::jit::Cond;
1010
use crate::DEBUG_LOG_BRANCH_OUT;
1111
use CpuType::ARM9;
1212

@@ -60,6 +60,16 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
6060
block_asm.call(restore_thumb_after_restore_spsr::<CPU> as *const ());
6161
}
6262

63+
if (op.is_mov() && self.jit_buf.current_inst().src_regs.is_reserved(Reg::LR) && !self.jit_buf.current_inst().out_regs.is_reserved(Reg::CPSR))
64+
|| (op.is_multiple_mem_transfer() && *self.jit_buf.current_inst().operands()[0].as_reg_no_shift().unwrap() == Reg::SP)
65+
|| (op.is_single_mem_transfer() && self.jit_buf.current_inst().src_regs.is_reserved(Reg::SP))
66+
{
67+
let guest_pc_reg = block_asm.new_reg();
68+
block_asm.load_u32(guest_pc_reg, block_asm.thread_regs_addr_reg, Reg::PC as u32 * 4);
69+
self.emit_branch_return_stack_common(block_asm, guest_pc_reg);
70+
block_asm.free_reg(guest_pc_reg);
71+
}
72+
6373
self.emit_branch_out_metadata(block_asm);
6474
block_asm.epilogue();
6575
}
@@ -77,15 +87,15 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
7787
if DEBUG_LOG_BRANCH_OUT {
7888
let pc_reg = block_asm.new_reg();
7989
block_asm.mov(pc_reg, self.jit_buf.current_pc);
80-
block_asm.transfer_write(pc_reg, runtime_data_addr_reg, JitRuntimeData::get_out_pc_offset() as u32, false, MemoryAmount::Word);
90+
block_asm.store_u32(pc_reg, runtime_data_addr_reg, JitRuntimeData::get_out_pc_offset() as u32);
8191

8292
block_asm.free_reg(pc_reg);
8393
}
84-
block_asm.transfer_write(total_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_out_total_cycles_offset() as u32, false, MemoryAmount::Word);
94+
block_asm.store_u32(total_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_out_total_cycles_offset() as u32);
8595
if set_idle_loop {
8696
let idle_loop_reg = block_asm.new_reg();
8797
block_asm.mov(idle_loop_reg, 1);
88-
block_asm.transfer_write(idle_loop_reg, runtime_data_addr_reg, JitRuntimeData::get_idle_loop_offset() as u32, false, MemoryAmount::Byte);
98+
block_asm.store_u8(idle_loop_reg, runtime_data_addr_reg, JitRuntimeData::get_idle_loop_offset() as u32);
8999

90100
block_asm.free_reg(idle_loop_reg);
91101
}
@@ -102,73 +112,55 @@ impl<'a, const CPU: CpuType> JitAsm<'a, CPU> {
102112
self._emit_branch_out_metadata(block_asm, true)
103113
}
104114

105-
pub fn emit_flush_cycles<ContinueFn: Fn(&mut Self, &mut BlockAsm, BlockReg), BreakoutFn: Fn(&mut Self, &mut BlockAsm)>(
115+
pub fn emit_flush_cycles<ContinueFn: Fn(&mut Self, &mut BlockAsm, BlockReg, BlockLabel), BreakoutFn: Fn(&mut Self, &mut BlockAsm)>(
106116
&mut self,
107117
block_asm: &mut BlockAsm,
108-
target_pre_cycle_count_sum: u16,
118+
target_pre_cycle_count_sum: Option<u16>,
109119
continue_fn: ContinueFn,
110120
breakout_fn: BreakoutFn,
111121
) {
112122
let runtime_data_addr_reg = block_asm.new_reg();
113123
block_asm.mov(runtime_data_addr_reg, self.runtime_data.get_addr() as u32);
114124

115125
let accumulated_cycles_reg = block_asm.new_reg();
116-
block_asm.transfer_read(
117-
accumulated_cycles_reg,
118-
runtime_data_addr_reg,
119-
JitRuntimeData::get_accumulated_cycles_offset() as u32,
120-
false,
121-
MemoryAmount::Half,
122-
);
126+
block_asm.load_u16(accumulated_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_accumulated_cycles_offset() as u32);
123127

124128
let pre_cycle_count_sum_reg = block_asm.new_reg();
125-
block_asm.transfer_read(
126-
pre_cycle_count_sum_reg,
127-
runtime_data_addr_reg,
128-
JitRuntimeData::get_pre_cycle_count_sum_offset() as u32,
129-
false,
130-
MemoryAmount::Half,
131-
);
129+
block_asm.load_u16(pre_cycle_count_sum_reg, runtime_data_addr_reg, JitRuntimeData::get_pre_cycle_count_sum_offset() as u32);
132130

133131
let total_cycles_reg = block_asm.new_reg();
134132
// +2 for branching
135133
block_asm.add(total_cycles_reg, accumulated_cycles_reg, self.jit_buf.insts_cycle_counts[self.jit_buf.current_index] as u32 + 2);
136134
block_asm.sub(total_cycles_reg, total_cycles_reg, pre_cycle_count_sum_reg);
137135

138-
const MAX_LOOP_CYCLE_COUNT: u32 = 256;
136+
const MAX_LOOP_CYCLE_COUNT: u32 = 255;
139137
block_asm.cmp(
140138
total_cycles_reg,
141139
match CPU {
142140
ARM9 => MAX_LOOP_CYCLE_COUNT * 2,
143141
ARM7 => MAX_LOOP_CYCLE_COUNT,
144-
} - 1,
142+
},
145143
);
146144

145+
let continue_label = block_asm.new_label();
147146
let breakout_label = block_asm.new_label();
148-
block_asm.branch(breakout_label, Cond::HI);
147+
block_asm.branch(breakout_label, Cond::HS);
149148

150-
block_asm.transfer_write(
151-
total_cycles_reg,
152-
runtime_data_addr_reg,
153-
JitRuntimeData::get_accumulated_cycles_offset() as u32,
154-
false,
155-
MemoryAmount::Half,
156-
);
149+
block_asm.store_u16(total_cycles_reg, runtime_data_addr_reg, JitRuntimeData::get_accumulated_cycles_offset() as u32);
157150

158151
let target_pre_cycle_count_sum_reg = block_asm.new_reg();
159-
block_asm.mov(target_pre_cycle_count_sum_reg, target_pre_cycle_count_sum as u32);
160-
block_asm.transfer_write(
161-
target_pre_cycle_count_sum_reg,
162-
runtime_data_addr_reg,
163-
JitRuntimeData::get_pre_cycle_count_sum_offset() as u32,
164-
false,
165-
MemoryAmount::Half,
166-
);
167-
continue_fn(self, block_asm, runtime_data_addr_reg);
152+
if let Some(target_pre_cycle_count_sum) = target_pre_cycle_count_sum {
153+
block_asm.mov(target_pre_cycle_count_sum_reg, target_pre_cycle_count_sum as u32);
154+
block_asm.store_u16(target_pre_cycle_count_sum_reg, runtime_data_addr_reg, JitRuntimeData::get_pre_cycle_count_sum_offset() as u32);
155+
}
156+
continue_fn(self, block_asm, runtime_data_addr_reg, breakout_label);
157+
block_asm.branch(continue_label, Cond::AL);
168158

169159
block_asm.label(breakout_label);
170160
breakout_fn(self, block_asm);
171161

162+
block_asm.label(continue_label);
163+
172164
block_asm.free_reg(target_pre_cycle_count_sum_reg);
173165
block_asm.free_reg(total_cycles_reg);
174166
block_asm.free_reg(pre_cycle_count_sum_reg);

0 commit comments

Comments
 (0)