diff --git a/CHANGELOG.md b/CHANGELOG.md index 51d8dca8072..c7398981243 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ ### Added +- Added support for PVH boot mode. This is used when an x86 kernel provides + the appropriate ELF Note to indicate that PVH boot mode is supported. + Linux kernels compiled with CONFIG_XEN_PVH=y set this ELF Note, as do + FreeBSD kernels. + ### Changed - Updated deserialization of `bitmap` for custom CPU templates to allow usage diff --git a/docs/pvh.md b/docs/pvh.md new file mode 100644 index 00000000000..4a351c301ec --- /dev/null +++ b/docs/pvh.md @@ -0,0 +1,15 @@ +# PVH boot mode + +Firecracker supports booting x86 kernels in "PVH direct boot" mode +[as specified by the Xen project](https://github.com/xen-project/xen/blob/master/docs/misc/pvh.pandoc). +If a kernel is provided which contains the XEN_ELFNOTE_PHYS32_ENTRY ELF Note +then this boot mode will be used. This boot mode was designed for virtualized +environments which load the kernel directly, and is simpler than the "Linux +boot" mode which is designed to be launched from a legacy boot loader. + +PVH boot mode can be enabled for Linux by setting CONFIG_XEN_PVH=y in the +kernel configuration. (This is not the default setting.) + +PVH boot mode is enabled by default in FreeBSD, which has support for +Firecracker starting with FreeBSD 14.0. Instructions on building a FreeBSD +kernel and root filesystem are available [here](rootfs-and-kernel-setup.md). diff --git a/docs/rootfs-and-kernel-setup.md b/docs/rootfs-and-kernel-setup.md index 1ba063b47e5..082774adcda 100644 --- a/docs/rootfs-and-kernel-setup.md +++ b/docs/rootfs-and-kernel-setup.md @@ -1,6 +1,6 @@ # Creating Custom rootfs and kernel Images -## Creating a kernel Image +## Creating a Linux kernel Image ### Manual compilation @@ -72,7 +72,7 @@ config="resources/guest_configs/microvm-kernel-arm64-4.14.config" on an aarch64 machine. 
-## Creating a rootfs Image +## Creating a Linux rootfs Image A rootfs image is just a file system image, that hosts at least an init system. For instance, our getting started guide uses an ext4 filesystem image. Note @@ -178,3 +178,45 @@ adjust the script(s) to suit your use case. You should now have a kernel image (`vmlinux`) and a rootfs image (`rootfs.ext4`), that you can boot with Firecracker. + +## Creating FreeBSD rootfs and kernel Images + +Here's a quick step-by-step guide to building a FreeBSD rootfs and kernel that +Firecracker can boot: + +1. Boot a FreeBSD system. In EC2, the + [FreeBSD 13 Marketplace image](https://aws.amazon.com/marketplace/pp/prodview-ukzmy5dzc6nbq) + is a good option; you can also use weekly snapshot AMIs published by the + FreeBSD project. (Firecracker support is in FreeBSD 14 and later, so you'll + need FreeBSD 13 or later to build it.) + + The build will require about 50 GB of disk space, so size the disk + appropriately. + +1. Log in to the FreeBSD system and become root. If using EC2, you'll want to + ssh in as `ec2-user` with your chosen SSH key and then `su` to become root. + +1. Install git and check out the FreeBSD src tree: + + ```sh + pkg install -y git + git clone https://git.freebsd.org/src.git /usr/src + ``` + + At present (July 2023) Firecracker support is only present in the `main` + branch. + +1. Build FreeBSD: + + ```sh + make -C /usr/src buildworld buildkernel KERNCONF=FIRECRACKER + make -C /usr/src/release firecracker DESTDIR=`pwd` + ``` + +You should now have a rootfs `freebsd-rootfs.bin` and a kernel `freebsd-kern.bin` +in the current directory (or elsewhere if you change the `DESTDIR` value) that +you can boot with Firecracker. 
Note that the FreeBSD rootfs generated in this +manner is somewhat minimized compared to "stock" FreeBSD; it omits utilities +which are only relevant on physical systems (e.g., utilities related to floppy +disks, USB devices, and some network interfaces) and also debug files and the +system compiler. diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 83b373af445..c574dda19a8 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -60,3 +60,34 @@ impl fmt::Display for DeviceType { write!(f, "{:?}", self) } } + +/// Supported boot protocols for booting the guest. +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum BootProtocol { + /// Linux 64-bit boot protocol + LinuxBoot, + #[cfg(target_arch = "x86_64")] + /// PVH boot protocol (x86/HVM direct boot ABI) + PvhBoot, +} + +impl fmt::Display for BootProtocol { + fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { + match self { + BootProtocol::LinuxBoot => write!(f, "Linux 64-bit boot protocol"), + #[cfg(target_arch = "x86_64")] + BootProtocol::PvhBoot => write!(f, "PVH boot protocol"), + } + } +} + +#[derive(Debug, Copy, Clone)] +/// Specifies the entry point address where the guest must start +/// executing code, as well as which boot protocol is to be used +/// to configure the guest initial state. +pub struct EntryPoint { + /// Address in guest memory where the guest must start execution + pub entry_addr: utils::vm_memory::GuestAddress, + /// Specifies which boot protocol to use + pub protocol: BootProtocol, +} diff --git a/src/vmm/src/arch/x86_64/gdt.rs b/src/vmm/src/arch/x86_64/gdt.rs index c7fcbf31bf0..b34ca8c3d64 100644 --- a/src/vmm/src/arch/x86_64/gdt.rs +++ b/src/vmm/src/arch/x86_64/gdt.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 // @@ -24,8 +26,37 @@ fn get_base(entry: u64) -> u64 { | (((entry) & 0x0000_0000_FFFF_0000) >> 16) } +// Extract the segment limit from the GDT segment descriptor. +// +// In a segment descriptor, the limit field is 20 bits, so it can directly describe +// a range from 0 to 0xFFFFF (1 MB). When G flag is set (4-KByte page granularity) it +// scales the value in the limit field by a factor of 2^12 (4 Kbytes), making the effective +// limit range from 0xFFF (4 KBytes) to 0xFFFF_FFFF (4 GBytes). +// +// However, the limit field in the VMCS definition is a 32 bit field, and the limit value is not +// automatically scaled using the G flag. This means that for a desired range of 4GB for a +// given segment, its limit must be specified as 0xFFFF_FFFF. Therefore the method of obtaining +// the limit from the GDT entry is not sufficient, since it only provides 20 bits when 32 bits +// are necessary. Fortunately, we can check if the G flag is set when extracting the limit since +// the full GDT entry is passed as an argument, and perform the scaling of the limit value to +// return the full 32 bit value. +// +// The scaling mentioned above is required when using PVH boot, since the guest boots in protected +// (32-bit) mode and must be able to access the entire 32-bit address space. It does not cause +// issues for the case of direct boot to 64-bit (long) mode, since in 64-bit mode the processor does +// not perform runtime limit checking on code or data segments. +// +// (For more information concerning the formats of segment descriptors, VMCS fields, et cetera, +// please consult the Intel Software Developer Manual.) 
fn get_limit(entry: u64) -> u32 { - ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32 + let limit: u32 = + ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32; + + // Perform manual limit scaling if G flag is set + match get_g(entry) { + 0 => limit, + _ => (limit << 12) | 0xFFF, // G flag is either 0 or 1 + } } fn get_g(entry: u64) -> u8 { @@ -109,7 +140,7 @@ mod tests { assert_eq!(0xB, seg.type_); // base and limit assert_eq!(0x10_0000, seg.base); - assert_eq!(0xfffff, seg.limit); + assert_eq!(0xffff_ffff, seg.limit); assert_eq!(0x0, seg.unusable); } } diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index 38776c1e7ed..84be9148967 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -27,5 +27,16 @@ pub const IRQ_MAX: u32 = 23; /// Address for the TSS setup. pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000; +/// Address of the hvm_start_info struct used in PVH boot +pub const PVH_INFO_START: u64 = 0x6000; + +/// Starting address of array of modules of hvm_modlist_entry type. +/// Used to enable initrd support using the PVH boot ABI. +pub const MODLIST_START: u64 = 0x6040; + +/// Address of memory map table used in PVH boot. Can overlap +/// with the zero page address since they are mutually exclusive. +pub const MEMMAP_START: u64 = 0x7000; + /// The 'zero page', a.k.a linux kernel bootparams. pub const ZERO_PAGE_START: u64 = 0x7000; diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 6d6db07a051..f4e402a9a5e 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 // @@ -19,14 +21,19 @@ pub mod msr; pub mod regs; use linux_loader::configurator::linux::LinuxBootConfigurator; +use linux_loader::configurator::pvh::PvhBootConfigurator; use linux_loader::configurator::{BootConfigurator, BootParams}; use linux_loader::loader::bootparam::boot_params; +use linux_loader::loader::elf::start_info::{ + hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, +}; use utils::vm_memory::{Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; -use crate::arch::InitrdConfig; +use crate::arch::{BootProtocol, InitrdConfig}; // Value taken from https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 const E820_RAM: u32 = 1; +const MEMMAP_TYPE_RAM: u32 = 1; /// Errors thrown while configuring x86_64 system. #[derive(Debug, PartialEq, Eq, derive_more::From)] @@ -39,6 +46,12 @@ pub enum ConfigurationError { ZeroPageSetup, /// Failed to compute initrd address. InitrdAddress, + /// Error writing module entry to guest memory. + ModlistSetup, + /// Error writing memory map table to guest memory. + MemmapTableSetup, + /// Error writing hvm_start_info to guest memory. + StartInfoSetup, } // Where BIOS/VGA magic would live on a real PC. @@ -102,12 +115,139 @@ pub fn initrd_load_addr( /// * `cmdline_size` - Size of the kernel command line in bytes including the null terminator. /// * `initrd` - Information about where the ramdisk image was loaded in the `guest_mem`. /// * `num_cpus` - Number of virtual CPUs the guest will have. +/// * `boot_prot` - Boot protocol that will be used to boot the guest. 
pub fn configure_system( guest_mem: &GuestMemoryMmap, cmdline_addr: GuestAddress, cmdline_size: usize, initrd: &Option, num_cpus: u8, + boot_prot: BootProtocol, +) -> Result<(), ConfigurationError> { + // Note that this puts the mptable at the last 1k of Linux's 640k base RAM + mptable::setup_mptable(guest_mem, num_cpus).map_err(ConfigurationError::MpTableSetup)?; + + match boot_prot { + BootProtocol::PvhBoot => { + configure_pvh(guest_mem, cmdline_addr, initrd)?; + } + BootProtocol::LinuxBoot => { + configure_64bit_boot(guest_mem, cmdline_addr, cmdline_size, initrd)?; + } + } + + Ok(()) +} + +fn configure_pvh( + guest_mem: &GuestMemoryMmap, + cmdline_addr: GuestAddress, + initrd: &Option, +) -> Result<(), ConfigurationError> { + const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; + let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); + let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let himem_start = GuestAddress(layout::HIMEM_START); + + // Vector to hold modules (currently either empty or holding initrd). + let mut modules: Vec = Vec::new(); + if let Some(initrd_config) = initrd { + // The initrd has been written to guest memory already, here we just need to + // create the module structure that describes it. + modules.push(hvm_modlist_entry { + paddr: initrd_config.address.raw_value(), + size: initrd_config.size as u64, + ..Default::default() + }); + } + + // Vector to hold the memory maps which needs to be written to guest memory + // at MEMMAP_START after all of the mappings are recorded. + let mut memmap: Vec = Vec::new(); + + // Create the memory map entries. 
+ add_memmap_entry(&mut memmap, 0, EBDA_START, MEMMAP_TYPE_RAM)?; + let last_addr = guest_mem.last_addr(); + if last_addr < end_32bit_gap_start { + add_memmap_entry( + &mut memmap, + himem_start.raw_value(), + last_addr.unchecked_offset_from(himem_start) + 1, + MEMMAP_TYPE_RAM, + )?; + } else { + add_memmap_entry( + &mut memmap, + himem_start.raw_value(), + end_32bit_gap_start.unchecked_offset_from(himem_start), + MEMMAP_TYPE_RAM, + )?; + + if last_addr > first_addr_past_32bits { + add_memmap_entry( + &mut memmap, + first_addr_past_32bits.raw_value(), + last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, + MEMMAP_TYPE_RAM, + )?; + } + } + + // Construct the hvm_start_info structure and serialize it into + // boot_params. This will be stored at PVH_INFO_START address, and %rbx + // will be initialized to contain PVH_INFO_START prior to starting the + // guest, as required by the PVH ABI. + let mut start_info = hvm_start_info { + magic: XEN_HVM_START_MAGIC_VALUE, + version: 1, + cmdline_paddr: cmdline_addr.raw_value(), + memmap_paddr: layout::MEMMAP_START, + memmap_entries: memmap.len() as u32, + nr_modules: modules.len() as u32, + ..Default::default() + }; + if !modules.is_empty() { + start_info.modlist_paddr = layout::MODLIST_START; + } + let mut boot_params = + BootParams::new::(&start_info, GuestAddress(layout::PVH_INFO_START)); + + // Copy the vector with the memmap table to the MEMMAP_START address + // which is already saved in the memmap_paddr field of hvm_start_info struct. + boot_params.set_sections::(&memmap, GuestAddress(layout::MEMMAP_START)); + + // Copy the vector with the modules list to the MODLIST_START address. + // Note that we only set the modlist_paddr address if there is a nonzero + // number of modules, but serializing an empty list is harmless. + boot_params.set_modules::(&modules, GuestAddress(layout::MODLIST_START)); + + // Write the hvm_start_info struct to guest memory. 
+ PvhBootConfigurator::write_bootparams(&boot_params, guest_mem) + .map_err(|_| ConfigurationError::StartInfoSetup) +} + +fn add_memmap_entry( + memmap: &mut Vec, + addr: u64, + size: u64, + mem_type: u32, +) -> Result<(), ConfigurationError> { + // Add the table entry to the vector + memmap.push(hvm_memmap_table_entry { + addr, + size, + type_: mem_type, + reserved: 0, + }); + + Ok(()) +} + +fn configure_64bit_boot( + guest_mem: &GuestMemoryMmap, + cmdline_addr: GuestAddress, + cmdline_size: usize, + initrd: &Option, ) -> Result<(), ConfigurationError> { const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55; const KERNEL_HDR_MAGIC: u32 = 0x5372_6448; @@ -118,9 +258,6 @@ pub fn configure_system( let himem_start = GuestAddress(layout::HIMEM_START); - // Note that this puts the mptable at the last 1k of Linux's 640k base RAM - mptable::setup_mptable(guest_mem, num_cpus)?; - let mut params = boot_params::default(); params.hdr.type_of_loader = KERNEL_LOADER_OTHER; @@ -225,7 +362,8 @@ mod tests { false, ) .unwrap(); - let config_err = configure_system(&gm, GuestAddress(0), 0, &None, 1); + let config_err = + configure_system(&gm, GuestAddress(0), 0, &None, 1, BootProtocol::LinuxBoot); assert!(config_err.is_err()); assert_eq!( config_err.unwrap_err(), @@ -237,21 +375,72 @@ mod tests { let arch_mem_regions = arch_memory_regions(mem_size); let gm = utils::vm_memory::test_utils::create_anon_guest_memory(&arch_mem_regions, false) .unwrap(); - configure_system(&gm, GuestAddress(0), 0, &None, no_vcpus).unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::LinuxBoot, + ) + .unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::PvhBoot, + ) + .unwrap(); // Now assigning some memory that is equal to the start of the 32bit memory hole. 
let mem_size = 3328 << 20; let arch_mem_regions = arch_memory_regions(mem_size); let gm = utils::vm_memory::test_utils::create_anon_guest_memory(&arch_mem_regions, false) .unwrap(); - configure_system(&gm, GuestAddress(0), 0, &None, no_vcpus).unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::LinuxBoot, + ) + .unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::PvhBoot, + ) + .unwrap(); // Now assigning some memory that falls after the 32bit memory hole. let mem_size = 3330 << 20; let arch_mem_regions = arch_memory_regions(mem_size); let gm = utils::vm_memory::test_utils::create_anon_guest_memory(&arch_mem_regions, false) .unwrap(); - configure_system(&gm, GuestAddress(0), 0, &None, no_vcpus).unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::LinuxBoot, + ) + .unwrap(); + configure_system( + &gm, + GuestAddress(0), + 0, + &None, + no_vcpus, + BootProtocol::PvhBoot, + ) + .unwrap(); } #[test] @@ -293,4 +482,31 @@ mod tests { ) .is_err()); } + + #[test] + fn test_add_memmap_entry() { + const MEMMAP_TYPE_RESERVED: u32 = 2; + + let mut memmap: Vec = Vec::new(); + + let expected_memmap = vec![ + hvm_memmap_table_entry { + addr: 0x0, + size: 0x1000, + type_: MEMMAP_TYPE_RAM, + ..Default::default() + }, + hvm_memmap_table_entry { + addr: 0x10000, + size: 0xa000, + type_: MEMMAP_TYPE_RESERVED, + ..Default::default() + }, + ]; + + add_memmap_entry(&mut memmap, 0, 0x1000, MEMMAP_TYPE_RAM).unwrap(); + add_memmap_entry(&mut memmap, 0x10000, 0xa000, MEMMAP_TYPE_RESERVED).unwrap(); + + assert_eq!(format!("{:?}", memmap), format!("{:?}", expected_memmap)); + } } diff --git a/src/vmm/src/arch/x86_64/regs.rs b/src/vmm/src/arch/x86_64/regs.rs index 41d37389ef7..2919cd55619 100644 --- a/src/vmm/src/arch/x86_64/regs.rs +++ b/src/vmm/src/arch/x86_64/regs.rs @@ -1,3 +1,4 @@ +// Copyright © 2020, Oracle and/or its affiliates. 
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +12,7 @@ use kvm_bindings::{kvm_fpu, kvm_regs, kvm_sregs}; use kvm_ioctls::VcpuFd; use utils::vm_memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; +use super::super::{BootProtocol, EntryPoint}; use super::gdt::{gdt_entry, kvm_segment_from_gdt}; // Initial pagetables. @@ -89,20 +91,30 @@ pub struct SetupRegistersError(utils::errno::Error); /// # Errors /// /// When [`kvm_ioctls::ioctls::vcpu::VcpuFd::set_regs`] errors. -pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64) -> Result<(), SetupRegistersError> { - let regs: kvm_regs = kvm_regs { - rflags: 0x0000_0000_0000_0002u64, - rip: boot_ip, - // Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments are - // made to rsp (i.e. reserving space for local variables or pushing values on to the stack), - // local variables and function parameters are still accessible from a constant offset from - // rbp. - rsp: super::layout::BOOT_STACK_POINTER, - // Starting stack pointer. - rbp: super::layout::BOOT_STACK_POINTER, - // Must point to zero page address per Linux ABI. This is x86_64 specific. - rsi: super::layout::ZERO_PAGE_START, - ..Default::default() +pub fn setup_regs(vcpu: &VcpuFd, entry_point: EntryPoint) -> Result<(), SetupRegistersError> { + let regs: kvm_regs = match entry_point.protocol { + BootProtocol::PvhBoot => kvm_regs { + // Configure regs as required by PVH boot protocol. + rflags: 0x0000_0000_0000_0002u64, + rbx: super::layout::PVH_INFO_START, + rip: entry_point.entry_addr.raw_value(), + ..Default::default() + }, + BootProtocol::LinuxBoot => kvm_regs { + // Configure regs as required by Linux 64-bit boot protocol. + rflags: 0x0000_0000_0000_0002u64, + rip: entry_point.entry_addr.raw_value(), + // Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments + // are made to rsp (i.e. 
reserving space for local variables or pushing + // values on to the stack), local variables and function parameters are + // still accessible from a constant offset from rbp. + rsp: super::layout::BOOT_STACK_POINTER, + // Starting stack pointer. + rbp: super::layout::BOOT_STACK_POINTER, + // Must point to zero page address per Linux ABI. This is x86_64 specific. + rsi: super::layout::ZERO_PAGE_START, + ..Default::default() + }, }; vcpu.set_regs(®s).map_err(SetupRegistersError) @@ -131,6 +143,7 @@ pub enum SetupSpecialRegistersError { /// /// * `mem` - The memory that will be passed to the guest. /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. +/// * `boot_prot` - The boot protocol being used. /// /// # Errors /// @@ -139,14 +152,21 @@ pub enum SetupSpecialRegistersError { /// - [`configure_segments_and_sregs`] errors. /// - [`setup_page_tables`] errors /// - [`kvm_ioctls::ioctls::vcpu::VcpuFd::set_sregs`] errors. -pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &VcpuFd) -> Result<(), SetupSpecialRegistersError> { +pub fn setup_sregs( + mem: &GuestMemoryMmap, + vcpu: &VcpuFd, + boot_prot: BootProtocol, +) -> Result<(), SetupSpecialRegistersError> { let mut sregs: kvm_sregs = vcpu .get_sregs() .map_err(SetupSpecialRegistersError::GetSpecialRegisters)?; - configure_segments_and_sregs(mem, &mut sregs) + configure_segments_and_sregs(mem, &mut sregs, boot_prot) .map_err(SetupSpecialRegistersError::ConfigureSegmentsAndSpecialRegisters)?; - setup_page_tables(mem, &mut sregs).map_err(SetupSpecialRegistersError::SetupPageTables)?; // TODO(dgreid) - Can this be done once per system instead? + if let BootProtocol::LinuxBoot = boot_prot { + setup_page_tables(mem, &mut sregs).map_err(SetupSpecialRegistersError::SetupPageTables)?; + // TODO(dgreid) - Can this be done once per system instead? 
+ } vcpu.set_sregs(&sregs) .map_err(SetupSpecialRegistersError::SetSpecialRegisters) @@ -161,6 +181,7 @@ const EFER_LMA: u64 = 0x400; const EFER_LME: u64 = 0x100; const X86_CR0_PE: u64 = 0x1; +const X86_CR0_ET: u64 = 0x10; const X86_CR0_PG: u64 = 0x8000_0000; const X86_CR4_PAE: u64 = 0x20; @@ -187,13 +208,28 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<(), RegsErro fn configure_segments_and_sregs( mem: &GuestMemoryMmap, sregs: &mut kvm_sregs, + boot_prot: BootProtocol, ) -> Result<(), RegsError> { - let gdt_table: [u64; BOOT_GDT_MAX] = [ - gdt_entry(0, 0, 0), // NULL - gdt_entry(0xa09b, 0, 0xfffff), // CODE - gdt_entry(0xc093, 0, 0xfffff), // DATA - gdt_entry(0x808b, 0, 0xfffff), // TSS - ]; + let gdt_table: [u64; BOOT_GDT_MAX] = match boot_prot { + BootProtocol::PvhBoot => { + // Configure GDT entries as specified by PVH boot protocol + [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xc09b, 0, 0xffff_ffff), // CODE + gdt_entry(0xc093, 0, 0xffff_ffff), // DATA + gdt_entry(0x008b, 0, 0x67), // TSS + ] + } + BootProtocol::LinuxBoot => { + // Configure GDT entries as specified by Linux 64bit boot protocol + [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ] + } + }; let code_seg = kvm_segment_from_gdt(gdt_table[1], 1); let data_seg = kvm_segment_from_gdt(gdt_table[2], 2); @@ -216,9 +252,17 @@ fn configure_segments_and_sregs( sregs.ss = data_seg; sregs.tr = tss_seg; - // 64-bit protected mode - sregs.cr0 |= X86_CR0_PE; - sregs.efer |= EFER_LME | EFER_LMA; + match boot_prot { + BootProtocol::PvhBoot => { + sregs.cr0 = X86_CR0_PE | X86_CR0_ET; + sregs.cr4 = 0; + } + BootProtocol::LinuxBoot => { + // 64-bit protected mode + sregs.cr0 |= X86_CR0_PE; + sregs.efer |= EFER_LME | EFER_LMA; + } + } Ok(()) } @@ -279,24 +323,45 @@ mod tests { gm.read_obj(read_addr).unwrap() } - fn validate_segments_and_sregs(gm: &GuestMemoryMmap, sregs: &kvm_sregs) { + 
fn validate_segments_and_sregs( + gm: &GuestMemoryMmap, + sregs: &kvm_sregs, + boot_prot: BootProtocol, + ) { + if let BootProtocol::LinuxBoot = boot_prot { + assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8)); + assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16)); + assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24)); + + assert_eq!(0xffff_ffff, sregs.tr.limit); + + assert!(sregs.cr0 & X86_CR0_PE != 0); + assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0); + } else { + // Validate values that are specific to PVH boot protocol + assert_eq!(0xcf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8)); + assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16)); + assert_eq!(0x00_8b00_0000_0067, read_u64(gm, BOOT_GDT_OFFSET + 24)); + + assert_eq!(0x67, sregs.tr.limit); + assert_eq!(0, sregs.tr.g); + + assert!(sregs.cr0 & X86_CR0_PE != 0 && sregs.cr0 & X86_CR0_ET != 0); + assert_eq!(0, sregs.cr4); + } + + // Common settings for both PVH and Linux boot protocol assert_eq!(0x0, read_u64(gm, BOOT_GDT_OFFSET)); - assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8)); - assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16)); - assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24)); assert_eq!(0x0, read_u64(gm, BOOT_IDT_OFFSET)); assert_eq!(0, sregs.cs.base); - assert_eq!(0xfffff, sregs.ds.limit); + assert_eq!(0xffff_ffff, sregs.ds.limit); assert_eq!(0x10, sregs.es.selector); assert_eq!(1, sregs.fs.present); assert_eq!(1, sregs.gs.g); assert_eq!(0, sregs.ss.avl); assert_eq!(0, sregs.tr.base); - assert_eq!(0xfffff, sregs.tr.limit); assert_eq!(0, sregs.tr.avl); - assert!(sregs.cr0 & X86_CR0_PE != 0); - assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0); } fn validate_page_tables(gm: &GuestMemoryMmap, sregs: &kvm_sregs) { @@ -348,7 +413,12 @@ mod tests { ..Default::default() }; - setup_regs(&vcpu, expected_regs.rip).unwrap(); + let entry_point: EntryPoint = 
EntryPoint { + entry_addr: GuestAddress(expected_regs.rip), + protocol: BootProtocol::LinuxBoot, + }; + + setup_regs(&vcpu, entry_point).unwrap(); let actual_regs: kvm_regs = vcpu.get_regs().unwrap(); assert_eq!(actual_regs, expected_regs); @@ -361,16 +431,22 @@ mod tests { let vcpu = vm.create_vcpu(0).unwrap(); let gm = create_guest_mem(None); - assert!(vcpu.set_sregs(&Default::default()).is_ok()); - setup_sregs(&gm, &vcpu).unwrap(); - - let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap(); - // for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment. - // We set it to 1, otherwise the test will fail. - sregs.gs.g = 1; - - validate_segments_and_sregs(&gm, &sregs); - validate_page_tables(&gm, &sregs); + [BootProtocol::LinuxBoot, BootProtocol::PvhBoot] + .iter() + .for_each(|boot_prot| { + assert!(vcpu.set_sregs(&Default::default()).is_ok()); + setup_sregs(&gm, &vcpu, *boot_prot).unwrap(); + + let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap(); + // for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment. + // We set it to 1, otherwise the test will fail. 
+ sregs.gs.g = 1; + + validate_segments_and_sregs(&gm, &sregs, *boot_prot); + if let BootProtocol::LinuxBoot = *boot_prot { + validate_page_tables(&gm, &sregs); + } + }); } #[test] @@ -415,9 +491,13 @@ mod tests { fn test_configure_segments_and_sregs() { let mut sregs: kvm_sregs = Default::default(); let gm = create_guest_mem(None); - configure_segments_and_sregs(&gm, &mut sregs).unwrap(); + configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::LinuxBoot).unwrap(); + + validate_segments_and_sregs(&gm, &sregs, BootProtocol::LinuxBoot); + + configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::PvhBoot).unwrap(); - validate_segments_and_sregs(&gm, &sregs); + validate_segments_and_sregs(&gm, &sregs, BootProtocol::PvhBoot); } #[test] diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 5e63d95e004..eb75dd2b00b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -14,10 +14,15 @@ use libc::EFD_NONBLOCK; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; #[cfg(target_arch = "x86_64")] use linux_loader::loader::elf::Elf as Loader; +#[cfg(target_arch = "x86_64")] +use linux_loader::loader::elf::PvhBootCapability; #[cfg(target_arch = "aarch64")] use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::KernelLoader; +#[cfg(target_arch = "aarch64")] use log::error; +#[cfg(target_arch = "x86_64")] +use log::{debug, error}; use seccompiler::BpfThreadMap; use snapshot::Persist; use userfaultfd::Uffd; @@ -28,7 +33,7 @@ use utils::vm_memory::{GuestAddress, GuestMemory, GuestMemoryMmap, ReadVolatile} use vm_superio::Rtc; use vm_superio::Serial; -use crate::arch::InitrdConfig; +use crate::arch::{BootProtocol, EntryPoint, InitrdConfig}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ @@ -256,7 +261,7 @@ pub fn build_microvm_for_boot( let track_dirty_pages = vm_resources.track_dirty_pages(); let guest_memory = create_guest_memory(vm_resources.vm_config.mem_size_mib, 
track_dirty_pages)?; - let entry_addr = load_kernel(boot_config, &guest_memory)?; + let entry_point = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; // Clone the command-line so that a failed boot doesn't pollute the original. #[allow(unused_mut)] @@ -310,7 +315,7 @@ pub fn build_microvm_for_boot( &vmm, vcpus.as_mut(), &vm_resources.vm_config, - entry_addr, + entry_point, &initrd, boot_cmdline, )?; @@ -544,16 +549,16 @@ pub fn create_guest_memory( .map_err(StartMicrovmError::GuestMemoryMmap) } +#[cfg(target_arch = "x86_64")] fn load_kernel( boot_config: &BootConfig, guest_memory: &GuestMemoryMmap, -) -> Result { +) -> Result { let mut kernel_file = boot_config .kernel_file .try_clone() .map_err(|err| StartMicrovmError::Internal(VmmError::KernelFile(err)))?; - #[cfg(target_arch = "x86_64")] let entry_addr = Loader::load::( guest_memory, None, @@ -562,7 +567,32 @@ fn load_kernel( ) .map_err(StartMicrovmError::KernelLoader)?; - #[cfg(target_arch = "aarch64")] + let mut entry_point_addr: GuestAddress = entry_addr.kernel_load; + let mut boot_prot: BootProtocol = BootProtocol::LinuxBoot; + if let PvhBootCapability::PvhEntryPresent(pvh_entry_addr) = entry_addr.pvh_boot_cap { + // Use the PVH kernel entry point to boot the guest + entry_point_addr = pvh_entry_addr; + boot_prot = BootProtocol::PvhBoot; + } + + debug!("Kernel loaded using {boot_prot}"); + + Ok(EntryPoint { + entry_addr: entry_point_addr, + protocol: boot_prot, + }) +} + +#[cfg(target_arch = "aarch64")] +fn load_kernel( + boot_config: &BootConfig, + guest_memory: &GuestMemoryMmap, +) -> Result { + let mut kernel_file = boot_config + .kernel_file + .try_clone() + .map_err(|err| StartMicrovmError::Internal(VmmError::KernelFile(err)))?; + let entry_addr = Loader::load::( guest_memory, Some(GuestAddress(crate::arch::get_kernel_start())), @@ -571,7 +601,10 @@ fn load_kernel( ) .map_err(StartMicrovmError::KernelLoader)?; - Ok(entry_addr.kernel_load) + 
Ok(EntryPoint { + entry_addr: entry_addr.kernel_load, + protocol: BootProtocol::LinuxBoot, + }) } fn load_initrd_from_config( @@ -743,7 +776,7 @@ pub fn configure_system_for_boot( vmm: &Vmm, vcpus: &mut [Vcpu], vm_config: &VmConfig, - entry_addr: GuestAddress, + entry_point: EntryPoint, initrd: &Option, boot_cmdline: LoaderKernelCmdline, ) -> Result<(), StartMicrovmError> { @@ -787,7 +820,7 @@ pub fn configure_system_for_boot( // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu - .configure(vmm.guest_memory(), entry_addr, &vcpu_config) + .configure(vmm.guest_memory(), entry_point, &vcpu_config) .map_err(VmmError::VcpuConfigure) .map_err(Internal)?; } @@ -812,6 +845,7 @@ pub fn configure_system_for_boot( cmdline_size, initrd, vcpus.len() as u8, + entry_point.protocol, ) .map_err(ConfigureSystem)?; } diff --git a/src/vmm/src/vstate/vcpu/aarch64.rs b/src/vmm/src/vstate/vcpu/aarch64.rs index c4c3257e2c2..5fbeb60acbe 100644 --- a/src/vmm/src/vstate/vcpu/aarch64.rs +++ b/src/vmm/src/vstate/vcpu/aarch64.rs @@ -7,7 +7,7 @@ use kvm_ioctls::*; use logger::{error, IncMetric, METRICS}; -use utils::vm_memory::{Address, GuestAddress, GuestMemoryMmap}; +use utils::vm_memory::{Address, GuestMemoryMmap}; use versionize::{VersionMap, Versionize, VersionizeError, VersionizeResult}; use versionize_derive::Versionize; @@ -18,6 +18,7 @@ use crate::arch::aarch64::vcpu::{ get_all_registers, get_all_registers_ids, get_mpidr, get_mpstate, get_registers, set_mpstate, set_registers, setup_boot_regs, VcpuError as ArchError, }; +use crate::arch::EntryPoint; use crate::cpu_config::templates::CpuConfiguration; use crate::vcpu::{VcpuConfig, VcpuError}; use crate::vstate::vcpu::VcpuEmulation; @@ -86,12 +87,13 @@ impl KvmVcpu { /// # Arguments /// /// * `guest_mem` - The guest memory used by this microvm. - /// * `kernel_load_addr` - Offset from `guest_mem` at which the kernel is loaded. 
+ /// * `kernel_entry_point` - Specifies the boot protocol and offset from `guest_mem` at which + /// the kernel starts. /// * `vcpu_config` - The vCPU configuration. pub fn configure( &mut self, guest_mem: &GuestMemoryMmap, - kernel_load_addr: GuestAddress, + kernel_entry_point: EntryPoint, vcpu_config: &VcpuConfig, ) -> Result<(), KvmVcpuError> { for reg in vcpu_config.cpu_config.regs.iter() { @@ -103,7 +105,7 @@ impl KvmVcpu { setup_boot_regs( &self.fd, self.index, - kernel_load_addr.raw_value(), + kernel_entry_point.entry_addr.raw_value(), guest_mem, ) .map_err(KvmVcpuError::ConfigureRegisters)?; @@ -228,10 +230,11 @@ mod tests { use std::os::unix::io::AsRawFd; use kvm_bindings::KVM_REG_SIZE_U64; - use utils::vm_memory::GuestMemoryMmap; + use utils::vm_memory::{GuestAddress, GuestMemoryMmap}; use super::*; use crate::arch::aarch64::regs::Aarch64RegisterRef; + use crate::arch::BootProtocol; use crate::cpu_config::aarch64::CpuConfiguration; use crate::vcpu::VcpuConfig; use crate::vstate::vm::tests::setup_vm; @@ -278,7 +281,10 @@ mod tests { assert!(vcpu .configure( &vm_mem, - GuestAddress(crate::arch::get_kernel_start()), + EntryPoint { + entry_addr: GuestAddress(crate::arch::get_kernel_start()), + protocol: BootProtocol::LinuxBoot, + }, &vcpu_config, ) .is_ok()); @@ -287,7 +293,10 @@ mod tests { let err = vcpu.configure( &vm_mem, - GuestAddress(crate::arch::get_kernel_start()), + EntryPoint { + entry_addr: GuestAddress(crate::arch::get_kernel_start()), + protocol: BootProtocol::LinuxBoot, + }, &vcpu_config, ); assert!(err.is_err()); diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs index e0b3ae94c81..87d7caad3db 100644 --- a/src/vmm/src/vstate/vcpu/mod.rs +++ b/src/vmm/src/vstate/vcpu/mod.rs @@ -721,6 +721,7 @@ pub mod tests { use utils::vm_memory::{GuestAddress, GuestMemoryMmap}; use super::*; + use crate::arch::{BootProtocol, EntryPoint}; use crate::builder::StartMicrovmError; use crate::devices::bus::DummyDevice; use 
crate::devices::BusDevice; @@ -945,7 +946,10 @@ pub mod tests { let vcpu_exit_evt = vcpu.exit_evt.try_clone().unwrap(); // Needs a kernel since we'll actually run this vcpu. - let entry_addr = load_good_kernel(&vm_mem); + let entry_point = EntryPoint { + entry_addr: load_good_kernel(&vm_mem), + protocol: BootProtocol::LinuxBoot, + }; #[cfg(target_arch = "x86_64")] { @@ -953,7 +957,7 @@ pub mod tests { vcpu.kvm_vcpu .configure( &vm_mem, - entry_addr, + entry_point, &VcpuConfig { vcpu_count: 1, smt: false, @@ -970,7 +974,7 @@ pub mod tests { vcpu.kvm_vcpu .configure( &vm_mem, - entry_addr, + entry_point, &VcpuConfig { vcpu_count: 1, smt: false, diff --git a/src/vmm/src/vstate/vcpu/x86_64.rs b/src/vmm/src/vstate/vcpu/x86_64.rs index cb9e3ba351c..041ec27de23 100644 --- a/src/vmm/src/vstate/vcpu/x86_64.rs +++ b/src/vmm/src/vstate/vcpu/x86_64.rs @@ -14,13 +14,14 @@ use kvm_bindings::{ use kvm_ioctls::{VcpuExit, VcpuFd}; use log::{error, warn}; use logger::{IncMetric, METRICS}; -use utils::vm_memory::{Address, GuestAddress, GuestMemoryMmap}; +use utils::vm_memory::GuestMemoryMmap; use versionize::{VersionMap, Versionize, VersionizeError, VersionizeResult}; use versionize_derive::Versionize; use crate::arch::x86_64::interrupts; use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError}; use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError}; +use crate::arch::EntryPoint; use crate::cpu_config::x86_64::{cpuid, CpuConfiguration}; use crate::vstate::vcpu::{VcpuConfig, VcpuEmulation}; use crate::vstate::vm::Vm; @@ -206,13 +207,14 @@ impl KvmVcpu { /// # Arguments /// /// * `guest_mem` - The guest memory used by this microvm. - /// * `kernel_start_addr` - Offset from `guest_mem` at which the kernel starts. + /// * `kernel_entry_point` - Specifies the boot protocol and offset from `guest_mem` at which + /// the kernel starts. /// * `vcpu_config` - The vCPU configuration. /// * `cpuid` - The capabilities exposed by this vCPU. 
pub fn configure( &mut self, guest_mem: &GuestMemoryMmap, - kernel_start_addr: GuestAddress, + kernel_entry_point: EntryPoint, vcpu_config: &VcpuConfig, ) -> Result<(), KvmVcpuConfigureError> { let mut cpuid = vcpu_config.cpu_config.cpuid.clone(); @@ -272,11 +274,10 @@ impl KvmVcpu { .collect::>(); crate::arch::x86_64::msr::set_msrs(&self.fd, &kvm_msrs)?; - crate::arch::x86_64::regs::setup_regs(&self.fd, kernel_start_addr.raw_value())?; + crate::arch::x86_64::regs::setup_regs(&self.fd, kernel_entry_point)?; crate::arch::x86_64::regs::setup_fpu(&self.fd)?; - crate::arch::x86_64::regs::setup_sregs(guest_mem, &self.fd)?; + crate::arch::x86_64::regs::setup_sregs(guest_mem, &self.fd, kernel_entry_point.protocol)?; crate::arch::x86_64::interrupts::set_lint(&self.fd)?; - Ok(()) } @@ -655,9 +656,11 @@ mod tests { use std::os::unix::io::AsRawFd; use kvm_ioctls::Cap; + use utils::vm_memory::GuestAddress; use super::*; use crate::arch::x86_64::cpu_model::CpuModel; + use crate::arch::BootProtocol; use crate::cpu_config::templates::{ CpuConfiguration, CpuTemplateType, CustomCpuTemplate, GetCpuTemplate, GuestConfigError, StaticCpuTemplate, @@ -728,7 +731,14 @@ mod tests { let vcpu_config = create_vcpu_config(&vm, &vcpu, &CustomCpuTemplate::default()).unwrap(); assert_eq!( - vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config,), + vcpu.configure( + &vm_mem, + EntryPoint { + entry_addr: GuestAddress(0), + protocol: BootProtocol::LinuxBoot, + }, + &vcpu_config, + ), Ok(()) ); @@ -740,7 +750,10 @@ mod tests { Ok(config) => vcpu .configure( &vm_mem, - GuestAddress(crate::arch::get_kernel_start()), + EntryPoint { + entry_addr: GuestAddress(crate::arch::get_kernel_start()), + protocol: BootProtocol::LinuxBoot, + }, &config, ) .is_ok(), @@ -843,8 +856,15 @@ mod tests { msrs: HashMap::new(), }, }; - vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config) - .unwrap(); + vcpu.configure( + &vm_mem, + EntryPoint { + entry_addr: GuestAddress(0), + protocol: BootProtocol::LinuxBoot, + }, + 
&vcpu_config, + ) + .unwrap(); // Invalid entries filled with 0 should not exist. let cpuid = vcpu.get_cpuid().unwrap(); @@ -905,8 +925,15 @@ mod tests { msrs: HashMap::new(), }, }; - vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config) - .unwrap(); + vcpu.configure( + &vm_mem, + EntryPoint { + entry_addr: GuestAddress(0), + protocol: BootProtocol::LinuxBoot, + }, + &vcpu_config, + ) + .unwrap(); assert!(vcpu.dump_cpu_config().is_ok()); } diff --git a/tests/integration_tests/style/test_licenses.py b/tests/integration_tests/style/test_licenses.py index 6ee744e9e8b..6a2412782a0 100644 --- a/tests/integration_tests/style/test_licenses.py +++ b/tests/integration_tests/style/test_licenses.py @@ -23,6 +23,8 @@ ) ALIBABA_COPYRIGHT = "Copyright (C) 2019 Alibaba Cloud Computing. All rights reserved." ALIBABA_LICENSE = "SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause" +ORACLE_COPYRIGHT = "Copyright © 2020, Oracle and/or its affiliates." +ORACLE_LICENSE = "SPDX-License-Identifier: Apache-2.0" EXCLUDE = ["build", ".kernel", ".git"] @@ -77,11 +79,16 @@ def _validate_license(filename): ALIBABA_COPYRIGHT in copyright_info and _look_for_license(file, ALIBABA_LICENSE) ) + + has_oracle_copyright = ORACLE_COPYRIGHT in copyright_info and _look_for_license( + file, ORACLE_LICENSE + ) return ( has_amazon_copyright or has_chromium_copyright or has_tuntap_copyright or has_alibaba_copyright + or has_oracle_copyright ) return True