diff --git a/Cargo.lock b/Cargo.lock index 95124ac6fe..d3123caaa4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -121,6 +121,26 @@ dependencies = [ "serde", ] +[[package]] +name = "capstone" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "015ef5d5ca1743e3f94af9509ba6bd2886523cfee46e48d15c2ef5216fd4ac9a" +dependencies = [ + "capstone-sys", + "libc", +] + +[[package]] +name = "capstone-sys" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2267cb8d16a1e4197863ec4284ffd1aec26fe7e57c58af46b02590a0235809a0" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "cargo-platform" version = "0.1.9" @@ -591,6 +611,7 @@ version = "0.1.0" dependencies = [ "aes", "bitflags", + "capstone", "chrono", "chrono-tz", "colored", diff --git a/Cargo.toml b/Cargo.toml index 76018efc4a..0b4b8e26bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ libloading = "0.8" nix = { version = "0.30.1", features = ["mman", "ptrace", "signal"] } ipc-channel = "0.19.0" serde = { version = "1.0.219", features = ["derive"] } +capstone = "0.13" [dev-dependencies] ui_test = "0.29.1" diff --git a/src/alloc/isolated_alloc.rs b/src/alloc/isolated_alloc.rs index 21aead329a..ef3dd98233 100644 --- a/src/alloc/isolated_alloc.rs +++ b/src/alloc/isolated_alloc.rs @@ -1,5 +1,7 @@ use std::alloc::Layout; +use std::ptr::NonNull; +use nix::sys::mman; use rustc_index::bit_set::DenseBitSet; /// How many bytes of memory each bit in the bitset represents. @@ -12,7 +14,7 @@ pub struct IsolatedAlloc { /// Pointers to page-aligned memory that has been claimed by the allocator. /// Every pointer here must point to a page-sized allocation claimed via /// mmap. These pointers are used for "small" allocations. - page_ptrs: Vec<*mut u8>, + page_ptrs: Vec<NonNull<u8>>, /// Metadata about which bytes have been allocated on each page. The length /// of this vector must be the same as that of `page_ptrs`, and the domain /// size of the bitset must be exactly `page_size / COMPRESSION_FACTOR`. @@ -24,7 +26,7 @@ pub struct IsolatedAlloc { page_infos: Vec<DenseBitSet<usize>>, /// Pointers to multiple-page-sized allocations. These must also be page-aligned, /// with their size stored as the second element of the vector. - huge_ptrs: Vec<(*mut u8, usize)>, + huge_ptrs: Vec<(NonNull<u8>, usize)>, /// The host (not emulated) page size. page_size: usize, } @@ -137,7 +139,7 @@ impl IsolatedAlloc { unsafe fn alloc_small( page_size: usize, layout: Layout, - page: *mut u8, + page: NonNull<u8>, pinfo: &mut DenseBitSet<usize>, zeroed: bool, ) -> Option<*mut u8> { @@ -164,7 +166,7 @@ impl IsolatedAlloc { // zero out, even if we allocated more ptr.write_bytes(0, layout.size()); } - return Some(ptr); + return Some(ptr.as_ptr()); } } } @@ -172,7 +174,7 @@ impl IsolatedAlloc { } /// Expands the available memory pool by adding one page. - fn add_page(&mut self) -> (*mut u8, &mut DenseBitSet<usize>) { + fn add_page(&mut self) -> (NonNull<u8>, &mut DenseBitSet<usize>) { // SAFETY: mmap is always safe to call when requesting anonymous memory let page_ptr = unsafe { libc::mmap( @@ -189,8 +191,8 @@ impl IsolatedAlloc { // `page_infos` has to have one bit for each `COMPRESSION_FACTOR`-sized chunk of bytes in the page.
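An editorial aside on the bookkeeping rule in the comment above (one bit per `COMPRESSION_FACTOR`-sized chunk of a page): it is easiest to see with concrete numbers. The sketch below is illustrative only and not part of the patch; it assumes a `COMPRESSION_FACTOR` of 16, which may differ from the actual constant in `isolated_alloc.rs`.

```rust
// Illustrative sketch only -- not part of the patch. The value of
// COMPRESSION_FACTOR is assumed; the real constant lives in isolated_alloc.rs.
const COMPRESSION_FACTOR: usize = 16;

/// Range of bit indices that an allocation of `size` bytes at byte `offset`
/// within a page would occupy in that page's bitset.
fn chunk_bits(offset: usize, size: usize) -> std::ops::Range<usize> {
    // Each bit stands for COMPRESSION_FACTOR bytes, so round the end up to
    // cover every chunk the allocation overlaps.
    offset / COMPRESSION_FACTOR..(offset + size).div_ceil(COMPRESSION_FACTOR)
}

fn main() {
    // A 24-byte allocation at offset 40 overlaps chunks 2 and 3 (bytes 32..64).
    assert_eq!(chunk_bits(40, 24), 2..4);
    // A 4096-byte page then needs 4096 / 16 = 256 bits of metadata, matching
    // the `page_size / COMPRESSION_FACTOR` domain size described above.
    assert_eq!(4096 / COMPRESSION_FACTOR, 256);
}
```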
assert!(self.page_size % COMPRESSION_FACTOR == 0); self.page_infos.push(DenseBitSet::new_empty(self.page_size / COMPRESSION_FACTOR)); - self.page_ptrs.push(page_ptr); - (page_ptr, self.page_infos.last_mut().unwrap()) + self.page_ptrs.push(NonNull::new(page_ptr).unwrap()); + (NonNull::new(page_ptr).unwrap(), self.page_infos.last_mut().unwrap()) } /// Allocates in multiples of one page on the host system. @@ -212,7 +214,7 @@ impl IsolatedAlloc { .cast::<u8>() }; assert_ne!(ret.addr(), usize::MAX, "mmap failed"); - self.huge_ptrs.push((ret, size)); + self.huge_ptrs.push((NonNull::new(ret).unwrap(), size)); // huge_normalized_layout ensures that we've overallocated enough space // for this to be valid. ret.map_addr(|a| a.next_multiple_of(layout.align())) @@ -246,7 +248,7 @@ impl IsolatedAlloc { // from us pointing to this page, and we know it was allocated // in add_page as exactly a single page. unsafe { - assert_eq!(libc::munmap(page_ptr.cast(), self.page_size), 0); + assert_eq!(libc::munmap(page_ptr.as_ptr().cast(), self.page_size), 0); } } } @@ -265,7 +267,7 @@ impl IsolatedAlloc { // This could be made faster if the list was sorted -- the allocator isn't fully optimized at the moment. let pinfo = std::iter::zip(&mut self.page_ptrs, &mut self.page_infos) .enumerate() - .find(|(_, (page, _))| page.addr() == page_addr); + .find(|(_, (page, _))| page.addr().get() == page_addr); let Some((idx_of_pinfo, (_, pinfo))) = pinfo else { panic!("Freeing in an unallocated page: {ptr:?}\nHolding pages {:?}", self.page_ptrs) }; @@ -287,7 +289,7 @@ impl IsolatedAlloc { .huge_ptrs .iter() .position(|&(pg, size)| { - pg.addr() <= ptr.addr() && ptr.addr() < pg.addr().strict_add(size) + pg.addr().get() <= ptr.addr() && ptr.addr() < pg.addr().get().strict_add(size) }) .expect("Freeing unallocated pages"); // And kick it from the list @@ -295,22 +297,59 @@ impl IsolatedAlloc { assert_eq!(size, size2, "got wrong layout in dealloc"); // SAFETY: huge_ptrs contains allocations made with mmap with the size recorded there. unsafe { - let ret = libc::munmap(un_offset_ptr.cast(), size); + let ret = libc::munmap(un_offset_ptr.as_ptr().cast(), size); assert_eq!(ret, 0); } } /// Returns a vector of page addresses managed by the allocator. pub fn pages(&self) -> Vec<usize> { - let mut pages: Vec<_> = - self.page_ptrs.clone().into_iter().map(|p| p.expose_provenance()).collect(); - for (ptr, size) in &self.huge_ptrs { + let mut pages: Vec<usize> = + self.page_ptrs.clone().into_iter().map(|p| p.expose_provenance().get()).collect(); + self.huge_ptrs.iter().for_each(|(ptr, size)| { for i in 0..size / self.page_size { - pages.push(ptr.expose_provenance().strict_add(i * self.page_size)); + pages.push(ptr.expose_provenance().get().strict_add(i * self.page_size)); } - } + }); pages } + + /// Protects all owned memory as `PROT_NONE`, preventing accesses. + /// + /// SAFETY: Accessing memory after this point will result in a segfault + /// unless it is first unprotected. + pub unsafe fn prepare_ffi(&mut self) -> Result<(), nix::errno::Errno> { + let prot = mman::ProtFlags::PROT_NONE; + unsafe { self.mprotect(prot) } + } + + /// Deprotects all owned memory by setting it to RW. Erroring here is very + /// likely unrecoverable, so it may panic if applying those permissions + /// fails. + pub fn unprep_ffi(&mut self) { + let prot = mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE; + unsafe { + self.mprotect(prot).unwrap(); + } + } + + /// Applies `prot` to every page managed by the allocator.
+ /// + /// SAFETY: Accessing memory in violation of the protection flags will + /// trigger a segfault. + unsafe fn mprotect(&mut self, prot: mman::ProtFlags) -> Result<(), nix::errno::Errno> { + for &pg in &self.page_ptrs { + unsafe { + mman::mprotect(pg.cast(), self.page_size, prot)?; + } + } + for &(hpg, size) in &self.huge_ptrs { + unsafe { + mman::mprotect(hpg.cast(), size.next_multiple_of(self.page_size), prot)?; + } + } + Ok(()) + } } #[cfg(test)] diff --git a/src/shims/native_lib.rs b/src/shims/native_lib.rs index 53d1de060f..3e455e1738 100644 --- a/src/shims/native_lib.rs +++ b/src/shims/native_lib.rs @@ -229,7 +229,14 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { .collect::>>(); // Call the function and store output, depending on return type in the function signature. - let (ret, _) = this.call_native_with_args(link_name, dest, code_ptr, libffi_args)?; + let (ret, maybe_memevents) = + this.call_native_with_args(link_name, dest, code_ptr, libffi_args)?; + + if cfg!(target_os = "linux") + && let Some(events) = maybe_memevents + { + trace!("Registered FFI events:\n{events:#0x?}"); + } this.write_immediate(*ret, dest)?; interp_ok(true) @@ -250,15 +257,15 @@ unsafe fn do_native_call( unsafe { if let Some(alloc) = alloc { - // SAFETY: We don't touch the machine memory past this point + // SAFETY: We don't touch the machine memory past this point. let (guard, stack_ptr) = Supervisor::start_ffi(alloc.clone()); - // SAFETY: Upheld by caller + // SAFETY: Upheld by caller. let ret = ffi::call(ptr, args); // SAFETY: We got the guard and stack pointer from start_ffi, and - // the allocator is the same + // the allocator is the same. (ret, Supervisor::end_ffi(guard, alloc, stack_ptr)) } else { - // SAFETY: Upheld by caller + // SAFETY: Upheld by caller. (ffi::call(ptr, args), None) } } diff --git a/src/shims/trace/child.rs b/src/shims/trace/child.rs index dcfdaad748..c320537b94 100644 --- a/src/shims/trace/child.rs +++ b/src/shims/trace/child.rs @@ -52,30 +52,40 @@ impl Supervisor { // If the supervisor is not initialised for whatever reason, fast-fail. // This might be desired behaviour, as even on platforms where ptracing // is not implemented it enables us to enforce that only one FFI call - // happens at a time + // happens at a time. let Some(sv) = sv_guard.take() else { return (sv_guard, None); }; // Get pointers to all the pages the supervisor must allow accesses in - // and prepare the fake stack + // and prepare the fake stack. let page_ptrs = alloc.borrow().pages(); let raw_stack_ptr: *mut [u8; FAKE_STACK_SIZE] = Box::leak(Box::new([0u8; FAKE_STACK_SIZE])).as_mut_ptr().cast(); let stack_ptr = raw_stack_ptr.expose_provenance(); let start_info = StartFfiInfo { page_ptrs, stack_ptr }; + // SAFETY: We do not access machine memory past this point until the + // supervisor is ready to allow it. + unsafe { + if alloc.borrow_mut().prepare_ffi().is_err() { + // Don't mess up unwinding by maybe leaving the memory partly protected + alloc.borrow_mut().unprep_ffi(); + panic!("Cannot protect memory for FFI call!"); + } + } + // Send over the info. // NB: if we do not wait to receive a blank confirmation response, it is // possible that the supervisor is alerted of the SIGSTOP *before* it has // actually received the start_info, thus deadlocking! This way, we can - // enforce an ordering for these events + // enforce an ordering for these events. 
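An editorial aside on the `prepare_ffi`/`unprep_ffi` pair added to the allocator above: it boils down to flipping every owned page between `PROT_NONE` and read/write around the native call. The sketch below shows that cycle on a single anonymous page using raw `libc` instead of the allocator; it is illustrative only and not part of the patch.

```rust
// Illustrative sketch only: one page standing in for the allocator's page list.
fn main() {
    unsafe {
        let page_size = libc::sysconf(libc::_SC_PAGESIZE) as usize;
        // Claim one anonymous page, much like IsolatedAlloc::add_page does.
        let page = libc::mmap(
            std::ptr::null_mut(),
            page_size,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        );
        assert_ne!(page, libc::MAP_FAILED);

        // "prepare_ffi": any native access to this page now raises SIGSEGV,
        // which is exactly the signal the supervisor intercepts.
        assert_eq!(libc::mprotect(page, page_size, libc::PROT_NONE), 0);

        // "unprep_ffi": restore read/write so the interpreter can use it again.
        assert_eq!(libc::mprotect(page, page_size, libc::PROT_READ | libc::PROT_WRITE), 0);
        page.cast::<u8>().write(42);

        assert_eq!(libc::munmap(page, page_size), 0);
    }
}
```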
sv.message_tx.send(TraceRequest::StartFfi(start_info)).unwrap(); sv.confirm_rx.recv().unwrap(); *sv_guard = Some(sv); // We need to be stopped for the supervisor to be able to make certain // modifications to our memory - simply waiting on the recv() doesn't - // count + // count. signal::raise(signal::SIGSTOP).unwrap(); (sv_guard, Some(raw_stack_ptr)) } @@ -90,7 +100,7 @@ impl Supervisor { /// one passed to it also. pub unsafe fn end_ffi( mut sv_guard: std::sync::MutexGuard<'static, Option<Supervisor>>, - _alloc: Rc<RefCell<IsolatedAlloc>>, + alloc: Rc<RefCell<IsolatedAlloc>>, raw_stack_ptr: Option<*mut [u8; FAKE_STACK_SIZE]>, ) -> Option<MemEvents> { // We can't use IPC channels here to signal that FFI mode has ended, @@ -99,19 +109,22 @@ impl Supervisor { // simpler and more robust to simply use the signals which are left for // arbitrary usage. Since this will block until we are continued by the // supervisor, we can assume past this point that everything is back to - // normal + // normal. signal::raise(signal::SIGUSR1).unwrap(); + // This is safe! It just sets memory to normal expected permissions. + alloc.borrow_mut().unprep_ffi(); + // If this is `None`, then `raw_stack_ptr` is None and does not need to // be deallocated (and there's no need to worry about the guard, since - // it contains nothing) + // it contains nothing). let sv = sv_guard.take()?; // SAFETY: Caller upholds that this pointer was allocated as a box with - // this type + // this type. unsafe { drop(Box::from_raw(raw_stack_ptr.unwrap())); } - // On the off-chance something really weird happens, don't block forever + // On the off-chance something really weird happens, don't block forever. let ret = sv .event_rx .try_recv_timeout(std::time::Duration::from_secs(5)) @@ -138,33 +151,34 @@ impl Supervisor { /// The invariants for `fork()` must be upheld by the caller. pub unsafe fn init_sv() -> Result<(), SvInitError> { // FIXME: Much of this could be reimplemented via the mitosis crate if we upstream the - // relevant missing bits + // relevant missing bits. // On Linux, this will check whether ptrace is fully disabled by the Yama module. // If Yama isn't running or we're not on Linux, we'll still error later, but - // this saves a very expensive fork call + // this saves a very expensive fork call. let ptrace_status = std::fs::read_to_string("/proc/sys/kernel/yama/ptrace_scope"); if let Ok(stat) = ptrace_status { if let Some(stat) = stat.chars().next() { - // Fast-error if ptrace is fully disabled on the system + // Fast-error if ptrace is fully disabled on the system. if stat == '3' { return Err(SvInitError); } } } - // Initialise the supervisor if it isn't already, placing it into SUPERVISOR + // Initialise the supervisor if it isn't already, placing it into SUPERVISOR. let mut lock = SUPERVISOR.lock().unwrap(); if lock.is_some() { return Ok(()); } - // Prepare the IPC channels we need + // Prepare the IPC channels we need. let (message_tx, message_rx) = ipc::channel().unwrap(); let (confirm_tx, confirm_rx) = ipc::channel().unwrap(); let (event_tx, event_rx) = ipc::channel().unwrap(); - // SAFETY: Calling sysconf(_SC_PAGESIZE) is always safe and cannot error + // SAFETY: Calling sysconf(_SC_PAGESIZE) is always safe and cannot error. let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }.try_into().unwrap(); + super::parent::PAGE_SIZE.store(page_size, std::sync::atomic::Ordering::Relaxed); unsafe { // TODO: Maybe use clone3() instead for better signalling of when the child exits?
@@ -172,37 +186,36 @@ pub unsafe fn init_sv() -> Result<(), SvInitError> { match unistd::fork().unwrap() { unistd::ForkResult::Parent { child } => { // If somehow another thread does exist, prevent it from accessing the lock - // and thus breaking our safety invariants + // and thus breaking our safety invariants. std::mem::forget(lock); // The child process is free to unwind, so we won't to avoid doubly freeing - // system resources + // system resources. let init = std::panic::catch_unwind(|| { let listener = ChildListener { message_rx, attached: false, override_retcode: None }; - // Trace as many things as possible, to be able to handle them as needed + // Trace as many things as possible, to be able to handle them as needed. let options = ptrace::Options::PTRACE_O_TRACESYSGOOD | ptrace::Options::PTRACE_O_TRACECLONE | ptrace::Options::PTRACE_O_TRACEFORK; - // Attach to the child process without stopping it + // Attach to the child process without stopping it. match ptrace::seize(child, options) { // Ptrace works :D Ok(_) => { - let code = sv_loop(listener, child, event_tx, confirm_tx, page_size) - .unwrap_err(); + let code = sv_loop(listener, child, event_tx, confirm_tx).unwrap_err(); // If a return code of 0 is not explicitly given, assume something went - // wrong and return 1 - std::process::exit(code.unwrap_or(1)) + // wrong and return 1. + std::process::exit(code.0.unwrap_or(1)) } - // Ptrace does not work and we failed to catch that + // Ptrace does not work and we failed to catch that. Err(_) => { - // If we can't ptrace, Miri continues being the parent + // If we can't ptrace, Miri continues being the parent. signal::kill(child, signal::SIGKILL).unwrap(); SvInitError } } }); match init { - // The "Ok" case means that we couldn't ptrace + // The "Ok" case means that we couldn't ptrace. Ok(e) => return Err(e), Err(p) => { eprintln!("Supervisor process panicked!\n{p:?}"); @@ -212,12 +225,12 @@ pub unsafe fn init_sv() -> Result<(), SvInitError> { } unistd::ForkResult::Child => { // Make sure we never get orphaned and stuck in SIGSTOP or similar - // SAFETY: prctl PR_SET_PDEATHSIG is always safe to call + // SAFETY: prctl PR_SET_PDEATHSIG is always safe to call. let ret = libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); assert_eq!(ret, 0); // First make sure the parent succeeded with ptracing us! signal::raise(signal::SIGSTOP).unwrap(); - // If we're the child process, save the supervisor info + // If we're the child process, save the supervisor info. *lock = Some(Supervisor { message_tx, confirm_rx, event_rx }); } } diff --git a/src/shims/trace/parent.rs b/src/shims/trace/parent.rs index 5eb8aa8465..cb9a9fc8da 100644 --- a/src/shims/trace/parent.rs +++ b/src/shims/trace/parent.rs @@ -1,16 +1,108 @@ +use std::sync::atomic::{AtomicPtr, AtomicUsize}; + use ipc_channel::ipc; use nix::sys::{ptrace, signal, wait}; use nix::unistd; -use super::StartFfiInfo; -use super::messages::{Confirmation, MemEvents, TraceRequest}; +use crate::shims::trace::messages::{Confirmation, MemEvents, TraceRequest}; +use crate::shims::trace::{AccessEvent, FAKE_STACK_SIZE, StartFfiInfo}; /// The flags to use when calling `waitid()`. -/// Since bitwise OR on the nix version of these flags is implemented as a trait, -/// we can't use them directly so we do it this way +/// Since bitwise or on the nix version of these flags is implemented as a trait, +/// this cannot be const directly so we do it this way. 
const WAIT_FLAGS: wait::WaitPidFlag = wait::WaitPidFlag::from_bits_truncate(libc::WUNTRACED | libc::WEXITED); +/// Arch-specific maximum size a single access might perform. x86 value is set +/// assuming nothing bigger than AVX-512 is available. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +const ARCH_MAX_ACCESS_SIZE: usize = 64; +/// The largest arm64 simd instruction operates on 16 bytes. +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +const ARCH_MAX_ACCESS_SIZE: usize = 16; +/// The max riscv vector instruction can access 8 consecutive 32-bit values. +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +const ARCH_MAX_ACCESS_SIZE: usize = 32; + +/// The default word size on a given platform, in bytes. +#[cfg(any(target_arch = "x86", target_arch = "arm", target_arch = "riscv32"))] +const ARCH_WORD_SIZE: usize = 4; +#[cfg(any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "riscv64"))] +const ARCH_WORD_SIZE: usize = 8; + +/// The address of the page set to be edited, initialised to a sentinel null +/// pointer. +static PAGE_ADDR: AtomicPtr<u8> = AtomicPtr::new(std::ptr::null_mut()); +/// The host pagesize, initialised to a sentinel zero value. +pub static PAGE_SIZE: AtomicUsize = AtomicUsize::new(0); +/// How many consecutive pages to unprotect. 1 by default, unlikely to be set +/// higher than 2. +static PAGE_COUNT: AtomicUsize = AtomicUsize::new(1); + +/// Allows us to get common arguments from the `user_regs_t` across architectures. +/// Normally this would land us in ABI hell, but thankfully all of our usecases +/// consist of functions with a small number of register-sized integer arguments. +/// See for sources. +trait ArchIndependentRegs { + /// Gets the address of the instruction pointer. + fn ip(&self) -> usize; + /// Set the instruction pointer; remember to also set the stack pointer, or + /// else the stack might get messed up! + fn set_ip(&mut self, ip: usize); + /// Set the stack pointer, ideally to a zeroed-out area. + fn set_sp(&mut self, sp: usize); +} + +// It's fine / desirable behaviour for values to wrap here, we care about just +// preserving the bit pattern.
+#[cfg(target_arch = "x86_64")] +#[expect(clippy::as_conversions)] +#[rustfmt::skip] +impl ArchIndependentRegs for libc::user_regs_struct { + #[inline] + fn ip(&self) -> usize { self.rip as _ } + #[inline] + fn set_ip(&mut self, ip: usize) { self.rip = ip as _ } + #[inline] + fn set_sp(&mut self, sp: usize) { self.rsp = sp as _ } +} + +#[cfg(target_arch = "x86")] +#[expect(clippy::as_conversions)] +#[rustfmt::skip] +impl ArchIndependentRegs for libc::user_regs_struct { + #[inline] + fn ip(&self) -> usize { self.eip as _ } + #[inline] + fn set_ip(&mut self, ip: usize) { self.eip = ip as _ } + #[inline] + fn set_sp(&mut self, sp: usize) { self.esp = sp as _ } +} + +#[cfg(target_arch = "aarch64")] +#[expect(clippy::as_conversions)] +#[rustfmt::skip] +impl ArchIndependentRegs for libc::user_regs_struct { + #[inline] + fn ip(&self) -> usize { self.pc as _ } + #[inline] + fn set_ip(&mut self, ip: usize) { self.pc = ip as _ } + #[inline] + fn set_sp(&mut self, sp: usize) { self.sp = sp as _ } +} + +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +#[expect(clippy::as_conversions)] +#[rustfmt::skip] +impl ArchIndependentRegs for libc::user_regs_struct { + #[inline] + fn ip(&self) -> usize { self.pc as _ } + #[inline] + fn set_ip(&mut self, ip: usize) { self.pc = ip as _ } + #[inline] + fn set_sp(&mut self, sp: usize) { self.sp = sp as _ } +} + /// A unified event representing something happening on the child process. Wraps /// `nix`'s `WaitStatus` and our custom signals so it can all be done with one /// `match` statement. @@ -22,7 +114,7 @@ pub enum ExecEvent { End, /// The child process with the specified pid was stopped by the given signal. Status(unistd::Pid, signal::Signal), - /// The child process with the specified pid entered or exited a syscall. + /// The child process with the specified pid entered or exited a syscall. Syscall(unistd::Pid), /// A child process exited or was killed; if we have a return code, it is /// specified. @@ -42,10 +134,10 @@ pub struct ChildListener { impl Iterator for ChildListener { type Item = ExecEvent; - // Allows us to monitor the child process by just iterating over the listener + // Allows us to monitor the child process by just iterating over the listener. // NB: This should never return None! fn next(&mut self) -> Option<Self::Item> { - // Do not block if the child has nothing to report for `waitid` + // Do not block if the child has nothing to report for `waitid`. let opts = WAIT_FLAGS | wait::WaitPidFlag::WNOHANG; loop { // Listen to any child, not just the main one. Important if we want match wait::waitid(wait::Id::All, opts) { Ok(stat) => match stat { - // Child exited normally with a specific code set + // Child exited normally with a specific code set. wait::WaitStatus::Exited(_, code) => { let code = self.override_retcode.unwrap_or(code); return Some(ExecEvent::Died(Some(code))); } - // Child was killed by a signal, without giving a code + // Child was killed by a signal, without giving a code. wait::WaitStatus::Signaled(_, _, _) => return Some(ExecEvent::Died(self.override_retcode)), // Child entered a syscall. Since we're always technically // tracing, only pass this along if we're actively - // monitoring the child + // monitoring the child.
wait::WaitStatus::PtraceSyscall(pid) => if self.attached { return Some(ExecEvent::Syscall(pid)); } @@ -84,11 +176,11 @@ impl Iterator for ChildListener { return Some(ExecEvent::Status(pid, signal)); } } else { - // Just pass along the signal + // Just pass along the signal. ptrace::cont(pid, signal).unwrap(); }, // Child was stopped at the given signal. Same logic as for - // WaitStatus::PtraceEvent + // WaitStatus::PtraceEvent. wait::WaitStatus::Stopped(pid, signal) => if self.attached { if signal == signal::SIGUSR1 { @@ -104,11 +196,11 @@ impl Iterator for ChildListener { }, // This case should only trigger if all children died and we // somehow missed that, but it's best we not allow any room - // for deadlocks + // for deadlocks. Err(_) => return Some(ExecEvent::Died(None)), } - // Similarly, do a non-blocking poll of the IPC channel + // Similarly, do a non-blocking poll of the IPC channel. if let Ok(req) = self.message_rx.try_recv() { match req { TraceRequest::StartFfi(info) => @@ -123,18 +215,16 @@ impl Iterator for ChildListener { } } - // Not ideal, but doing anything else might sacrifice performance + // Not ideal, but doing anything else might sacrifice performance. std::thread::yield_now(); } } } /// An error came up while waiting on the child process to do something. +/// It likely died, with this return code if we have one. #[derive(Debug)] -enum ExecError { - /// The child process died with this return code, if we have one. - Died(Option<i32>), -} +pub struct ExecEnd(pub Option<i32>); /// This is the main loop of the supervisor process. It runs in a separate /// process from the rest of Miri (but because we fork, addresses for anything @@ -144,35 +234,37 @@ pub fn sv_loop( init_pid: unistd::Pid, event_tx: ipc::IpcSender<MemEvents>, confirm_tx: ipc::IpcSender<Confirmation>, - _page_size: usize, -) -> Result<!, Option<i32>> { - // Things that we return to the child process +) -> Result<!, ExecEnd> { + // Get the pagesize set and make sure it isn't still on the zero sentinel value! + let page_size = PAGE_SIZE.load(std::sync::atomic::Ordering::Relaxed); + assert_ne!(page_size, 0); + + // Things that we return to the child process. let mut acc_events = Vec::new(); - // Memory allocated on the MiriMachine - let mut _ch_pages = Vec::new(); - let mut _ch_stack = None; + // Memory allocated for the MiriMachine. + let mut ch_pages = Vec::new(); + let mut ch_stack = None; + + // An instance of the Capstone disassembler, so we don't spawn one on every access. + let cs = get_disasm(); // The pid of the last process we interacted with, used by default if we don't have a - // reason to use a different one + // reason to use a different one. let mut curr_pid = init_pid; - // There's an initial sigstop we need to deal with - wait_for_signal(Some(curr_pid), signal::SIGSTOP, false).map_err(|e| { - match e { - ExecError::Died(code) => code, - } - })?; + // There's an initial sigstop we need to deal with. + wait_for_signal(Some(curr_pid), signal::SIGSTOP, false)?; ptrace::cont(curr_pid, None).unwrap(); for evt in listener { match evt { - // start_ffi was called by the child, so prep memory + // start_ffi was called by the child, so prep memory. ExecEvent::Start(ch_info) => { - // All the pages that the child process is "allowed to" access - _ch_pages = ch_info.page_ptrs; - // And the fake stack it allocated for us to use later - _ch_stack = Some(ch_info.stack_ptr); + // All the pages that the child process is "allowed to" access. + ch_pages = ch_info.page_ptrs; + // And the fake stack it allocated for us to use later.
+ ch_stack = Some(ch_info.stack_ptr); // We received the signal and are no longer in the main listener loop, // so we can let the child move on to the end of start_ffi where it will @@ -180,39 +272,54 @@ pub fn sv_loop( // order to do most ptrace operations! confirm_tx.send(Confirmation).unwrap(); // We can't trust simply calling `Pid::this()` in the child process to give the right - // PID for us, so we get it this way + // PID for us, so we get it this way. curr_pid = wait_for_signal(None, signal::SIGSTOP, false).unwrap(); ptrace::syscall(curr_pid, None).unwrap(); } - // end_ffi was called by the child + // end_ffi was called by the child. ExecEvent::End => { - // Hand over the access info we traced + // Hand over the access info we traced. event_tx.send(MemEvents { acc_events }).unwrap(); - // And reset our values + // And reset our values. acc_events = Vec::new(); - _ch_stack = None; + ch_stack = None; - // No need to monitor syscalls anymore, they'd just be ignored + // No need to monitor syscalls anymore, they'd just be ignored. ptrace::cont(curr_pid, None).unwrap(); } // Child process was stopped by a signal - ExecEvent::Status(pid, signal) => { - eprintln!("Process unexpectedly got {signal}; continuing..."); - // In case we're not tracing - if ptrace::syscall(pid, signal).is_err() { - // If *this* fails too, something really weird happened - // and it's probably best to just panic - signal::kill(pid, signal::SIGCONT).unwrap(); - } - } + ExecEvent::Status(pid, signal) => + match signal { + // If it was a segfault, check if it was an artificial one + // caused by it trying to access the MiriMachine memory. + signal::SIGSEGV => + handle_segfault( + pid, + &ch_pages, + ch_stack.unwrap(), + page_size, + &cs, + &mut acc_events, + )?, + // Something weird happened. + _ => { + eprintln!("Process unexpectedly got {signal}; continuing..."); + // In case we're not tracing + if ptrace::syscall(pid, None).is_err() { + // If *this* fails too, something really weird happened + // and it's probably best to just panic. + signal::kill(pid, signal::SIGCONT).unwrap(); + } + } + }, // Child entered a syscall; we wait for exits inside of this, so it - // should never trigger on return from a syscall we care about + // should never trigger on return from a syscall we care about. ExecEvent::Syscall(pid) => { ptrace::syscall(pid, None).unwrap(); } ExecEvent::Died(code) => { - return Err(code); + return Err(ExecEnd(code)); } } } @@ -220,6 +327,30 @@ pub fn sv_loop( unreachable!() } +/// Spawns a Capstone disassembler for the host architecture. +#[rustfmt::skip] +fn get_disasm() -> capstone::Capstone { + use capstone::prelude::*; + let cs_pre = Capstone::new(); + { + #[cfg(target_arch = "x86_64")] + {cs_pre.x86().mode(arch::x86::ArchMode::Mode64)} + #[cfg(target_arch = "x86")] + {cs_pre.x86().mode(arch::x86::ArchMode::Mode32)} + #[cfg(target_arch = "aarch64")] + {cs_pre.arm64().mode(arch::arm64::ArchMode::Arm)} + #[cfg(target_arch = "arm")] + {cs_pre.arm().mode(arch::arm::ArchMode::Arm)} + #[cfg(target_arch = "riscv64")] + {cs_pre.riscv().mode(arch::riscv::ArchMode::RiscV64)} + #[cfg(target_arch = "riscv32")] + {cs_pre.riscv().mode(arch::riscv::ArchMode::RiscV32)} + } + .detail(true) + .build() + .unwrap() +} + /// Waits for `wait_signal`. If `init_cont`, it will first do a `ptrace::cont`. /// We want to avoid that in some cases, like at the beginning of FFI. 
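An editorial aside before `wait_for_signal` continues below: `get_disasm` above only builds the disassembler; `handle_segfault` later feeds it the bytes of the single faulting instruction and inspects its memory operands. The following is a self-contained sketch of that query on x86-64, illustrative only and not part of the patch (the byte sequence encodes `mov rax, qword ptr [rdi]`).

```rust
// Illustrative sketch only. Requires the capstone crate and an x86-64 target.
use capstone::prelude::*;

fn main() -> capstone::CsResult<()> {
    let cs = Capstone::new()
        .x86()
        .mode(arch::x86::ArchMode::Mode64)
        .detail(true)
        .build()?;

    // 48 8b 07 = `mov rax, qword ptr [rdi]`: a single 8-byte memory read.
    let insns = cs.disasm_count(&[0x48, 0x8b, 0x07], 0x1000, 1)?;
    let detail = cs.insn_detail(&insns[0])?;

    for op in detail.arch_detail().operands() {
        if let arch::ArchOperand::X86Operand(op) = op {
            if let arch::x86::X86OperandType::Mem(_) = op.op_type {
                // The access flags tell us whether this operand is read, written, or both.
                let access = op.access.unwrap();
                println!(
                    "memory operand: {} bytes, read: {}, write: {}",
                    op.size,
                    access.is_readable(),
                    access.is_writable()
                );
            }
        }
    }
    Ok(())
}
```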
/// @@ -228,27 +359,27 @@ fn wait_for_signal( pid: Option<unistd::Pid>, wait_signal: signal::Signal, init_cont: bool, -) -> Result<unistd::Pid, ExecError> { +) -> Result<unistd::Pid, ExecEnd> { if init_cont { ptrace::cont(pid.unwrap(), None).unwrap(); } - // Repeatedly call `waitid` until we get the signal we want, or the process dies + // Repeatedly call `waitid` until we get the signal we want, or the process dies. loop { let wait_id = match pid { Some(pid) => wait::Id::Pid(pid), None => wait::Id::All, }; - let stat = wait::waitid(wait_id, WAIT_FLAGS).map_err(|_| ExecError::Died(None))?; + let stat = wait::waitid(wait_id, WAIT_FLAGS).map_err(|_| ExecEnd(None))?; let (signal, pid) = match stat { - // Report the cause of death, if we know it + // Report the cause of death, if we know it. wait::WaitStatus::Exited(_, code) => { - return Err(ExecError::Died(Some(code))); + return Err(ExecEnd(Some(code))); } - wait::WaitStatus::Signaled(_, _, _) => return Err(ExecError::Died(None)), + wait::WaitStatus::Signaled(_, _, _) => return Err(ExecEnd(None)), wait::WaitStatus::Stopped(pid, signal) => (signal, pid), wait::WaitStatus::PtraceEvent(pid, signal, _) => (signal, pid), // This covers PtraceSyscall and variants that are impossible with - // the flags set (e.g. WaitStatus::StillAlive) + // the flags set (e.g. WaitStatus::StillAlive). _ => { ptrace::cont(pid.unwrap(), None).unwrap(); continue; @@ -257,7 +388,302 @@ fn wait_for_signal( if signal == wait_signal { return Ok(pid); } else { - ptrace::cont(pid, None).map_err(|_| ExecError::Died(None))?; + ptrace::cont(pid, signal).map_err(|_| ExecEnd(None))?; + } + } +} + +/// Grabs the access that caused a segfault and logs it down if it's to our memory, +/// or kills the child and returns the appropriate error otherwise. +fn handle_segfault( + pid: unistd::Pid, + ch_pages: &[usize], + ch_stack: usize, + page_size: usize, + cs: &capstone::Capstone, + acc_events: &mut Vec<AccessEvent>, +) -> Result<(), ExecEnd> { + /// This is just here to not pollute the main namespace with `capstone::prelude::*`. + #[inline] + fn capstone_disassemble( + instr: &[u8], + addr: usize, + cs: &capstone::Capstone, + acc_events: &mut Vec<AccessEvent>, + ) -> capstone::CsResult<()> { + use capstone::prelude::*; + + // The arch_detail is what we care about, but it relies on these temporaries + // that we can't drop. 0x1000 is the default base address for Capstone, and + // we're expecting 1 instruction. + let insns = cs.disasm_count(instr, 0x1000, 1)?; + let ins_detail = cs.insn_detail(&insns[0])?; + let arch_detail = ins_detail.arch_detail(); + + for op in arch_detail.operands() { + match op { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + arch::ArchOperand::X86Operand(x86_operand) => { + match x86_operand.op_type { + // We only care about memory accesses + arch::x86::X86OperandType::Mem(_) => { + let push = addr..addr.strict_add(usize::from(x86_operand.size)); + // It's called a "RegAccessType" but it also applies to memory + let acc_ty = x86_operand.access.unwrap(); + if acc_ty.is_readable() { + acc_events.push(AccessEvent::Read(push.clone())); + } + if acc_ty.is_writable() { + acc_events.push(AccessEvent::Write(push)); + } + } + _ => (), + } + } + #[cfg(target_arch = "aarch64")] + arch::ArchOperand::Arm64Operand(arm64_operand) => { + // Annoyingly, we don't always get the size here, so just be pessimistic for now. + match arm64_operand.op_type { + arch::arm64::Arm64OperandType::Mem(_) => { + // B = 1 byte, H = 2 bytes, S = 4 bytes, D = 8 bytes, Q = 16 bytes. + let size = match arm64_operand.vas { + // Not an fp/simd instruction.
+ arch::arm64::Arm64Vas::ARM64_VAS_INVALID => ARCH_WORD_SIZE, + // 1 byte. + arch::arm64::Arm64Vas::ARM64_VAS_1B => 1, + // 2 bytes. + arch::arm64::Arm64Vas::ARM64_VAS_1H => 2, + // 4 bytes. + arch::arm64::Arm64Vas::ARM64_VAS_4B + | arch::arm64::Arm64Vas::ARM64_VAS_2H + | arch::arm64::Arm64Vas::ARM64_VAS_1S => 4, + // 8 bytes. + arch::arm64::Arm64Vas::ARM64_VAS_8B + | arch::arm64::Arm64Vas::ARM64_VAS_4H + | arch::arm64::Arm64Vas::ARM64_VAS_2S + | arch::arm64::Arm64Vas::ARM64_VAS_1D => 8, + // 16 bytes. + arch::arm64::Arm64Vas::ARM64_VAS_16B + | arch::arm64::Arm64Vas::ARM64_VAS_8H + | arch::arm64::Arm64Vas::ARM64_VAS_4S + | arch::arm64::Arm64Vas::ARM64_VAS_2D + | arch::arm64::Arm64Vas::ARM64_VAS_1Q => 16, + }; + let push = addr..addr.strict_add(size); + // FIXME: This now has access type info in the latest + // git version of capstone because this pissed me off + // and I added it. Change this when it updates. + acc_events.push(AccessEvent::Read(push.clone())); + acc_events.push(AccessEvent::Write(push)); + } + _ => (), + } + } + #[cfg(target_arch = "arm")] + arch::ArchOperand::ArmOperand(arm_operand) => + match arm_operand.op_type { + arch::arm::ArmOperandType::Mem(_) => { + // We don't get info on the size of the access, but + // we're at least told if it's a vector instruction. + let size = if arm_operand.vector_index.is_some() { + ARCH_MAX_ACCESS_SIZE + } else { + ARCH_WORD_SIZE + }; + let push = addr..addr.strict_add(size); + let acc_ty = arm_operand.access.unwrap(); + if acc_ty.is_readable() { + acc_events.push(AccessEvent::Read(push.clone())); + } + if acc_ty.is_writable() { + acc_events.push(AccessEvent::Write(push)); + } + } + _ => (), + }, + #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] + arch::ArchOperand::RiscVOperand(risc_voperand) => { + match risc_voperand { + arch::riscv::RiscVOperand::Mem(_) => { + // We get basically no info here. + let push = addr..addr.strict_add(size); + acc_events.push(AccessEvent::Read(push.clone())); + acc_events.push(AccessEvent::Write(push)); + } + _ => (), + } + } + _ => unimplemented!(), + } + } + + Ok(()) + } + + // Get information on what caused the segfault. This contains the address + // that triggered it. + let siginfo = ptrace::getsiginfo(pid).unwrap(); + // All x86, ARM, etc. instructions only have at most one memory operand + // (thankfully!) + // SAFETY: si_addr is safe to call. + let addr = unsafe { siginfo.si_addr().addr() }; + let page_addr = addr.strict_sub(addr.strict_rem(page_size)); + + if ch_pages.iter().any(|pg| (*pg..pg.strict_add(page_size)).contains(&addr)) { + // Overall structure: + // - Get the address that caused the segfault + // - Unprotect the memory + // - Step 1 instruction + // - Parse executed code to estimate size & type of access + // - Reprotect the memory + // - Continue + + // Ensure the stack is properly zeroed out! + for a in (ch_stack..ch_stack.strict_add(FAKE_STACK_SIZE)).step_by(ARCH_WORD_SIZE) { + ptrace::write(pid, std::ptr::with_exposed_provenance_mut(a), 0).unwrap(); + } + + // Guard against both architectures with upwards and downwards-growing stacks. + let stack_ptr = ch_stack.strict_add(FAKE_STACK_SIZE / 2); + let regs_bak = ptrace::getregs(pid).unwrap(); + let mut new_regs = regs_bak; + let ip_prestep = regs_bak.ip(); + + // Move the instr ptr into the deprotection code. + #[expect(clippy::as_conversions)] + new_regs.set_ip(mempr_off as usize); + // Don't mess up the stack by accident! 
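An editorial aside before the register swap continues below: the arithmetic around the fault (the `page_addr` computed above, and the `PAGE_COUNT` of 1 or 2 written just after this) rounds the faulting address down to its page and unprotects a second page whenever the allocator also owns the next one, so an access running over the boundary stays covered. A small sketch of that decision, illustrative only, with 4096 standing in for the host page size:

```rust
// Illustrative sketch only; 4096 is an assumed stand-in for the host page size.
const PAGE_SIZE: usize = 4096;

/// Page base of a faulting address, plus how many pages to unprotect: two if
/// the allocator also owns the following page, otherwise one.
fn pages_to_unprotect(addr: usize, owned_pages: &[usize]) -> (usize, usize) {
    let page_addr = addr - addr % PAGE_SIZE;
    let count = if owned_pages.contains(&(page_addr + PAGE_SIZE)) { 2 } else { 1 };
    (page_addr, count)
}

fn main() {
    let owned = [0x7000, 0x8000];
    // Fault inside 0x7000..0x8000: the next page is owned too, so unprotect both.
    assert_eq!(pages_to_unprotect(0x7ff8, &owned), (0x7000, 2));
    // Fault inside 0x8000..0x9000: 0x9000 is not owned, one page is enough.
    assert_eq!(pages_to_unprotect(0x8010, &owned), (0x8000, 1));
}
```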
+ new_regs.set_sp(stack_ptr); + + // Modify the PAGE_ADDR global on the child process to point to the page + // that we want unprotected. + ptrace::write( + pid, + (&raw const PAGE_ADDR).cast_mut().cast(), + libc::c_long::try_from(page_addr).unwrap(), + ) + .unwrap(); + + // Check if we also own the next page, and if so unprotect it in case + // the access spans the page boundary. + let flag = if ch_pages.contains(&page_addr.strict_add(page_size)) { 2 } else { 1 }; + ptrace::write(pid, (&raw const PAGE_COUNT).cast_mut().cast(), flag).unwrap(); + + ptrace::setregs(pid, new_regs).unwrap(); + + // Our mempr_* functions end with a raise(SIGSTOP). + wait_for_signal(Some(pid), signal::SIGSTOP, true)?; + + // Step 1 instruction. + ptrace::setregs(pid, regs_bak).unwrap(); + ptrace::step(pid, None).unwrap(); + // Don't use wait_for_signal here since 1 instruction doesn't give room + // for any uncertainty + we don't want it `cont()`ing randomly by accident + // Also, don't let it continue with unprotected memory if something errors! + let _ = wait::waitid(wait::Id::Pid(pid), WAIT_FLAGS).map_err(|_| ExecEnd(None))?; + + // Zero out again to be safe + for a in (ch_stack..ch_stack.strict_add(FAKE_STACK_SIZE)).step_by(ARCH_WORD_SIZE) { + ptrace::write(pid, std::ptr::with_exposed_provenance_mut(a), 0).unwrap(); } + + // Save registers and grab the bytes that were executed. This would + // be really nasty if it was a jump or similar but those thankfully + // won't do memory accesses and so can't trigger this! + let regs_bak = ptrace::getregs(pid).unwrap(); + new_regs = regs_bak; + let ip_poststep = regs_bak.ip(); + // We need to do reads/writes in word-sized chunks. + let diff = (ip_poststep.strict_sub(ip_prestep)).div_ceil(ARCH_WORD_SIZE); + let instr = (ip_prestep..ip_prestep.strict_add(diff)).fold(vec![], |mut ret, ip| { + // This only needs to be a valid pointer in the child process, not ours. + ret.append( + &mut ptrace::read(pid, std::ptr::without_provenance_mut(ip)) + .unwrap() + .to_ne_bytes() + .to_vec(), + ); + ret + }); + + // Now figure out the size + type of access and log it down + // This will mark down e.g. the same area being read multiple times, + // since it's more efficient to compress the accesses at the end. + if capstone_disassemble(&instr, addr, cs, acc_events).is_err() { + // Read goes first because we need to be pessimistic. + acc_events.push(AccessEvent::Read(addr..addr.strict_add(ARCH_MAX_ACCESS_SIZE))); + acc_events.push(AccessEvent::Write(addr..addr.strict_add(ARCH_MAX_ACCESS_SIZE))); + } + + // Reprotect everything and continue. + #[expect(clippy::as_conversions)] + new_regs.set_ip(mempr_on as usize); + new_regs.set_sp(stack_ptr); + ptrace::setregs(pid, new_regs).unwrap(); + wait_for_signal(Some(pid), signal::SIGSTOP, true)?; + + ptrace::setregs(pid, regs_bak).unwrap(); + ptrace::syscall(pid, None).unwrap(); + Ok(()) + } else { + // This was a real segfault, so print some debug info and quit. + let regs = ptrace::getregs(pid).unwrap(); + eprintln!("Segfault occurred during FFI at {addr:#018x}"); + eprintln!("Expected access on pages: {ch_pages:#018x?}"); + eprintln!("Register dump: {regs:#x?}"); + ptrace::kill(pid).unwrap(); + Err(ExecEnd(None)) + } +} + +// We only get dropped into these functions via offsetting the instr pointer +// manually, so we *must not ever* unwind from them. + +/// Disables protections on the page whose address is currently in `PAGE_ADDR`. 
+/// +/// SAFETY: `PAGE_ADDR` should be set to a page-aligned pointer to an owned page, +/// `PAGE_SIZE` should be the host pagesize, and the range from `PAGE_ADDR` to +/// `PAGE_SIZE` * `PAGE_COUNT` must be owned and allocated memory. No other threads +/// should be running. +pub unsafe extern "C" fn mempr_off() { + use std::sync::atomic::Ordering; + + // Again, cannot allow unwinds to happen here. + let len = PAGE_SIZE.load(Ordering::Relaxed).saturating_mul(PAGE_COUNT.load(Ordering::Relaxed)); + // SAFETY: Upheld by "caller". + unsafe { + // It's up to the caller to make sure this doesn't actually overflow, but + // we mustn't unwind from here, so... + if libc::mprotect( + PAGE_ADDR.load(Ordering::Relaxed).cast(), + len, + libc::PROT_READ | libc::PROT_WRITE, + ) != 0 + { + // Can't return or unwind, but we can do this. + std::process::exit(-1); + } + } + // If this fails somehow we're doomed. + if signal::raise(signal::SIGSTOP).is_err() { + std::process::exit(-1); + } +} + +/// Reenables protection on the page set by `PAGE_ADDR`. +/// +/// SAFETY: See `mempr_off()`. +pub unsafe extern "C" fn mempr_on() { + use std::sync::atomic::Ordering; + + let len = PAGE_SIZE.load(Ordering::Relaxed).wrapping_mul(PAGE_COUNT.load(Ordering::Relaxed)); + // SAFETY: Upheld by "caller". + unsafe { + if libc::mprotect(PAGE_ADDR.load(Ordering::Relaxed).cast(), len, libc::PROT_NONE) != 0 { + std::process::exit(-1); + } + } + if signal::raise(signal::SIGSTOP).is_err() { + std::process::exit(-1); } }