diff --git a/CHANGELOG.md b/CHANGELOG.md index d243b8ef923..eef7641f0bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,10 @@ and this project adheres to - [#5175](https://github.com/firecracker-microvm/firecracker/pull/5175): Allow including a custom cpu template directly in the json configuration file passed to `--config-file` under the `cpu_config` key. +- [#5274](https://github.com/firecracker-microvm/firecracker/pull/5274): Allow + taking diff snapshots even if dirty page tracking is disabled, by using + `mincore(2)` to overapproximate the set of dirty pages. Only works if swap is + disabled. ### Changed @@ -25,6 +29,10 @@ and this project adheres to ### Deprecated +- [#5274](https://github.com/firecracker-microvm/firecracker/pull/5274): + Deprecated the `enable_diff_snapshots` parameter of the `/snapshot/load` API. + Use `track_dirty_pages` instead. + ### Removed ### Fixed diff --git a/DEPRECATED.md b/DEPRECATED.md index e20e4af628a..4e53c0e31ac 100644 --- a/DEPRECATED.md +++ b/DEPRECATED.md @@ -21,3 +21,5 @@ a future major Firecracker release, in accordance with our The functionality is substituted with ACPI. - \[[#2628](https://github.com/firecracker-microvm/firecracker/pull/2628)\] The `--basic` parameter of `seccompiler-bin`. +- \[[#5274](https://github.com/firecracker-microvm/firecracker/pull/5274)\]: The + `enable_diff_snapshots` body field in `PUT` requests on `/snapshot/load` diff --git a/docs/device-api.md b/docs/device-api.md index dd2b5e4c9f6..64f5b23c477 100644 --- a/docs/device-api.md +++ b/docs/device-api.md @@ -37,68 +37,68 @@ All input schema fields can be found in the [Swagger](https://swagger.io) specification: [firecracker.yaml](./../src/firecracker/swagger/firecracker.yaml). 
-| Schema | Property | keyboard | serial console | virtio-block | vhost-user-block | virtio-net | virtio-vsock | virtio-rng | -| ------------------------- | --------------------- | :------: | :------------: | :----------: | :--------------: | :--------: | :----------: | :--------: | -| `BootSource` | boot_args | O | O | O | O | O | O | O | -| | initrd_path | O | O | O | O | O | O | O | -| | kernel_image_path | O | O | O | O | O | O | O | -| `CpuConfig` | cpuid_modifiers | O | O | O | O | O | O | O | -| | msr_modifiers | O | O | O | O | O | O | O | -| | reg_modifiers | O | O | O | O | O | O | O | -| `CpuTemplate` | enum | O | O | O | O | O | O | O | -| `CreateSnapshotParams` | mem_file_path | O | O | O | O | O | O | O | -| | snapshot_path | O | O | O | O | O | O | O | -| | snapshot_type | O | O | O | O | O | O | O | -| | version | O | O | O | O | O | O | O | -| `Drive` | drive_id \* | O | O | **R** | **R** | O | O | O | -| | is_read_only | O | O | **R** | O | O | O | O | -| | is_root_device \* | O | O | **R** | **R** | O | O | O | -| | partuuid \* | O | O | **R** | **R** | O | O | O | -| | path_on_host | O | O | **R** | O | O | O | O | -| | rate_limiter | O | O | **R** | O | O | O | O | -| | socket | O | O | O | **R** | O | O | O | -| `InstanceActionInfo` | action_type | O | O | O | O | O | O | O | -| `LoadSnapshotParams` | enable_diff_snapshots | O | O | O | O | O | O | O | -| | mem_file_path | O | O | O | O | O | O | O | -| | mem_backend | O | O | O | O | O | O | O | -| | snapshot_path | O | O | O | O | O | O | O | -| | resume_vm | O | O | O | O | O | O | O | -| `Logger` | level | O | O | O | O | O | O | O | -| | log_path | O | O | O | O | O | O | O | -| | show_level | O | O | O | O | O | O | O | -| | show_log_origin | O | O | O | O | O | O | O | -| `MachineConfiguration` | cpu_template | O | O | O | O | O | O | O | -| | smt | O | O | O | O | O | O | O | -| | mem_size_mib | O | O | O | O | O | O | O | -| | track_dirty_pages | O | O | O | O | O | O | O | -| | 
vcpu_count | O | O | O | O | O | O | O | -| `Metrics` | metrics_path | O | O | O | O | O | O | O | -| `MmdsConfig` | network_interfaces | O | O | O | O | **R** | O | O | -| | version | O | O | O | O | **R** | O | O | -| | ipv4_address | O | O | O | O | **R** | O | O | -| `NetworkInterface` | guest_mac | O | O | O | O | **R** | O | O | -| | host_dev_name | O | O | O | O | **R** | O | O | -| | iface_id | O | O | O | O | **R** | O | O | -| | rx_rate_limiter | O | O | O | O | **R** | O | O | -| | tx_rate_limiter | O | O | O | O | **R** | O | O | -| `PartialDrive` | drive_id | O | O | **R** | O | O | O | O | -| | path_on_host | O | O | **R** | O | O | O | O | -| `PartialNetworkInterface` | iface_id | O | O | O | O | **R** | O | O | -| | rx_rate_limiter | O | O | O | O | **R** | O | O | -| | tx_rate_limiter | O | O | O | O | **R** | O | O | -| `RateLimiter` | bandwidth | O | O | O | O | **R** | O | O | -| | ops | O | O | **R** | O | O | O | O | -| `TokenBucket` \*\* | one_time_burst | O | O | **R** | O | O | O | O | -| | refill_time | O | O | **R** | O | O | O | O | -| | size | O | O | **R** | O | O | O | O | -| `TokenBucket` \*\* | one_time_burst | O | O | O | O | **R** | O | O | -| | refill_time | O | O | O | O | **R** | O | O | -| | size | O | O | O | O | **R** | O | O | -| `Vm` | state | O | O | O | O | O | O | O | -| `Vsock` | guest_cid | O | O | O | O | O | **R** | O | -| | uds_path | O | O | O | O | O | **R** | O | -| | vsock_id | O | O | O | O | O | **R** | O | -| `EntropyDevice` | rate_limiter | O | O | O | O | O | O | **R** | +| Schema | Property | keyboard | serial console | virtio-block | vhost-user-block | virtio-net | virtio-vsock | virtio-rng | +| ------------------------- | ------------------ | :------: | :------------: | :----------: | :--------------: | :--------: | :----------: | :--------: | +| `BootSource` | boot_args | O | O | O | O | O | O | O | +| | initrd_path | O | O | O | O | O | O | O | +| | kernel_image_path | O | O | O | O | O | O | O | +| 
`CpuConfig` | cpuid_modifiers | O | O | O | O | O | O | O | +| | msr_modifiers | O | O | O | O | O | O | O | +| | reg_modifiers | O | O | O | O | O | O | O | +| `CpuTemplate` | enum | O | O | O | O | O | O | O | +| `CreateSnapshotParams` | mem_file_path | O | O | O | O | O | O | O | +| | snapshot_path | O | O | O | O | O | O | O | +| | snapshot_type | O | O | O | O | O | O | O | +| | version | O | O | O | O | O | O | O | +| `Drive` | drive_id \* | O | O | **R** | **R** | O | O | O | +| | is_read_only | O | O | **R** | O | O | O | O | +| | is_root_device \* | O | O | **R** | **R** | O | O | O | +| | partuuid \* | O | O | **R** | **R** | O | O | O | +| | path_on_host | O | O | **R** | O | O | O | O | +| | rate_limiter | O | O | **R** | O | O | O | O | +| | socket | O | O | O | **R** | O | O | O | +| `InstanceActionInfo` | action_type | O | O | O | O | O | O | O | +| `LoadSnapshotParams` | track_dirty_pages | O | O | O | O | O | O | O | +| | mem_file_path | O | O | O | O | O | O | O | +| | mem_backend | O | O | O | O | O | O | O | +| | snapshot_path | O | O | O | O | O | O | O | +| | resume_vm | O | O | O | O | O | O | O | +| `Logger` | level | O | O | O | O | O | O | O | +| | log_path | O | O | O | O | O | O | O | +| | show_level | O | O | O | O | O | O | O | +| | show_log_origin | O | O | O | O | O | O | O | +| `MachineConfiguration` | cpu_template | O | O | O | O | O | O | O | +| | smt | O | O | O | O | O | O | O | +| | mem_size_mib | O | O | O | O | O | O | O | +| | track_dirty_pages | O | O | O | O | O | O | O | +| | vcpu_count | O | O | O | O | O | O | O | +| `Metrics` | metrics_path | O | O | O | O | O | O | O | +| `MmdsConfig` | network_interfaces | O | O | O | O | **R** | O | O | +| | version | O | O | O | O | **R** | O | O | +| | ipv4_address | O | O | O | O | **R** | O | O | +| `NetworkInterface` | guest_mac | O | O | O | O | **R** | O | O | +| | host_dev_name | O | O | O | O | **R** | O | O | +| | iface_id | O | O | O | O | **R** | O | O | +| | 
rx_rate_limiter | O | O | O | O | **R** | O | O | +| | tx_rate_limiter | O | O | O | O | **R** | O | O | +| `PartialDrive` | drive_id | O | O | **R** | O | O | O | O | +| | path_on_host | O | O | **R** | O | O | O | O | +| `PartialNetworkInterface` | iface_id | O | O | O | O | **R** | O | O | +| | rx_rate_limiter | O | O | O | O | **R** | O | O | +| | tx_rate_limiter | O | O | O | O | **R** | O | O | +| `RateLimiter` | bandwidth | O | O | O | O | **R** | O | O | +| | ops | O | O | **R** | O | O | O | O | +| `TokenBucket` \*\* | one_time_burst | O | O | **R** | O | O | O | O | +| | refill_time | O | O | **R** | O | O | O | O | +| | size | O | O | **R** | O | O | O | O | +| `TokenBucket` \*\* | one_time_burst | O | O | O | O | **R** | O | O | +| | refill_time | O | O | O | O | **R** | O | O | +| | size | O | O | O | O | **R** | O | O | +| `Vm` | state | O | O | O | O | O | O | O | +| `Vsock` | guest_cid | O | O | O | O | O | **R** | O | +| | uds_path | O | O | O | O | O | **R** | O | +| | vsock_id | O | O | O | O | O | **R** | O | +| `EntropyDevice` | rate_limiter | O | O | O | O | O | O | **R** | \* `Drive`'s `drive_id`, `is_root_device` and `partuuid` can be configured by either virtio-block or vhost-user-block devices. diff --git a/docs/hugepages.md b/docs/hugepages.md index a6f18e06dd2..17cd5cc9b43 100644 --- a/docs/hugepages.md +++ b/docs/hugepages.md @@ -24,9 +24,7 @@ pool, please refer to the [Linux Documentation][hugetlbfs_docs]. Restoring a Firecracker snapshot of a microVM backed by huge pages will also use huge pages to back the restored guest. There is no option to flip between regular, 4K, pages and huge pages at restore time. Furthermore, snapshots of -microVMs backed with huge pages can only be restored via UFFD. Lastly, note that -even for guests backed by huge pages, differential snapshots will always track -write accesses to guest memory at 4K granularity. +microVMs backed with huge pages can only be restored via UFFD. 
When restoring snapshots via UFFD, Firecracker will send the configured page size (in KiB) for each memory region as part of the initial handshake, as @@ -40,12 +38,17 @@ Firecracker features: - Memory Ballooning via the [Balloon Device](./ballooning.md) +Furthermore, enabling dirty page tracking for hugepage memory negates the +performance benefits of using huge pages. This is because KVM will +unconditionally establish guest page tables at 4K granularity if dirty page +tracking is enabled, even if the host uses huge mappings. + ## FAQ ### Why does Firecracker not offer a transparent huge pages (THP) setting? -Firecracker's guest memory is memfd based. Linux (as of 6.1) does not offer a -way to dynamically enable THP for such memory regions. Additionally, UFFD does +Firecracker's guest memory can be memfd based. Linux (as of 6.1) does not offer +a way to dynamically enable THP for such memory regions. Additionally, UFFD does not integrate with THP (no transparent huge pages will be allocated during userfaulting). Please refer to the [Linux Documentation][thp_docs] for more information. diff --git a/docs/snapshotting/snapshot-support.md b/docs/snapshotting/snapshot-support.md index f6bcd5bb883..421f6dcf583 100644 --- a/docs/snapshotting/snapshot-support.md +++ b/docs/snapshotting/snapshot-support.md @@ -122,7 +122,7 @@ the feature can be combined with guest_memfd support in Firecracker. ### Limitations -- High snapshot latency on 5.4+ host kernels due to cgroups V1. We strongly +- High snapshot restoration latency when cgroups V1 are in use. We strongly recommend to deploy snapshots on cgroups V2 enabled hosts for the implied kernel versions - [related issue](https://github.com/firecracker-microvm/firecracker/issues/2129). @@ -145,10 +145,11 @@ the feature can be combined with guest_memfd support in Firecracker. resumed from snapshot load memory on-demand from the snapshot and copy-on-write to anonymous memory. 
- Resuming from a snapshot is optimized for speed, while taking a snapshot - involves some extra CPU cycles for synchronously writing dirty memory pages to - the memory snapshot file. Taking a snapshot of a fresh microVM, on which dirty - pages tracking is not enabled, results in the full contents of guest memory - being written to the snapshot. + involves some extra CPU cycles for synchronously writing memory pages to the + memory snapshot file. Taking a full snapshot of a microVM, on which dirty page + tracking is not enabled, results in the full contents of guest memory being + written to the snapshot, and particularly, in all guest memory being faulted + in. - The _memory file_ and _microVM state file_ are generated by Firecracker on snapshot creation. The disk contents are _not_ explicitly flushed to their backing files. @@ -207,23 +208,17 @@ the microVM in the `Paused` state. **Effects**: Now that the microVM is paused, you can create a snapshot, which can be either a `full`one or a `diff` one. Full snapshots always create a complete, resume-able snapshot of the current microVM state and memory. Diff snapshots save the -current microVM state and the memory dirtied since the last snapshot (full or -diff). Diff snapshots are not resume-able, but can be merged into a full -snapshot. In this context, we will refer to the base as the first memory file -created by a `/snapshot/create` API call and the layer as a memory file created -by a subsequent `/snapshot/create` API call. The order in which the snapshots -were created matters and they should be merged in the same order in which they -were created. To merge a `diff` snapshot memory file on top of a base, users -should copy its content over the base. 
This can be done using the `rebase-snap` -(deprecated) or `snapshot-editor` tools provided with the firecracker release: - -`rebase-snap` (deprecated) example: - -```bash -rebase-snap --base-file path/to/base --diff-file path/to/layer -``` - -`snapshot-editor` example: +current microVM state and the memory accessed since the last snapshot (full or +diff). The result of a diff snapshot will be a sparse file, with only accessed +pages written (and other ranges becoming holes). Diff snapshots are not +resume-able, but can be merged into a full snapshot. In this context, we will +refer to the base as the first memory file created by a `/snapshot/create` API +call and the layer as a memory file created by a subsequent `/snapshot/create` +API call. The order in which the snapshots were created matters and they should +be merged in the same order in which they were created. To merge a `diff` +snapshot memory file on top of a base, users should copy its content over the +base. This can be done using the `snapshot-editor` tool provided with the +firecracker release: ```bash snapshot-editor edit-memory rebase \ @@ -281,9 +276,9 @@ the snapshot. If they exist, the files will be truncated and overwritten. contents are only guaranteed to be committed/flushed to the host FS, but not necessarily to the underlying persistent storage (could still live in host FS cache). - - If diff snapshots were enabled, the snapshot creation resets then the - dirtied page bitmap and marks all pages clean (from a diff snapshot point of - view). + - If dirty page tracking is enabled, the snapshot creation then resets the + dirtied page bitmap and marks all pages clean (from a dirty page tracking + point of view). - _on failure_: no side-effects. @@ -313,10 +308,23 @@ curl --unix-socket /tmp/firecracker.socket -i \ **Prerequisites**: The microVM is `Paused`. 
-*Note*: On a fresh microVM, `track_dirty_pages` field should be set to `true`, -when configuring the `/machine-config` resource, while on a snapshot loaded -microVM, `enable_diff_snapshots` from `PUT /snapshot/load`request body, should -be set. +*Note*: Diff snapshots come in two flavors. If `track_dirty_pages` was set to +`true` when configuring the `/machine-config` resource or when restoring from a +snapshot via `/snapshot/load`, Firecracker will use KVM's dirty page log runtime +functionality to ensure the diff snapshot contains exactly the pages that were +written to since boot / snapshot restoration. If `track_dirty_pages` is not +enabled, Firecracker will instead over-approximate the set of pages to include +in the snapshot by considering all pages that were _accessed_ during the +VM's lifetime. This potentially results in bigger memory files (although they +are still sparse), but avoids the runtime overhead of dirty page logging. + +*Note*: Dirty page tracking negates most of the benefits of +[huge pages](../hugepages.md#known-limitations). + +Without dirty page tracking enabled, Firecracker uses the +[`mincore(2)`][man mincore] syscall to determine which pages to include in the +snapshot. As such, this mode of snapshot taking will only work _if swap is +disabled_, as mincore does not consider pages written to swap to be "in core". **Effects**: @@ -350,10 +358,12 @@ Enabling this support enables KVM dirty page tracking, so it comes at a cost (which consists of CPU cycles spent by KVM accounting for dirtied pages); it should only be used when needed. -Creating a snapshot will **not** influence state, will **not** stop or end the -microVM, it can be used as before, so the microVM can be resumed if you still -want to use it. At this point, in case you plan to continue using the current -microVM, you should make sure to also copy the disk backing files. 
+Creating a snapshot has some minor effects on the currently running microVM: + +- The vsock device is [reset](#vsock-device-reset), causing the driver to + terminate connection on resumption. +- On x86_64, a notification for KVM-clock is injected to notify the guest about + being paused. ### Resuming the microVM @@ -378,8 +388,8 @@ ignored (microVM remains in the running state). **Effects**: ### Loading snapshots If you want to load a snapshot, you can do that only **before** the microVM is -configured (the only resources that can be configured prior are the Logger and -the Metrics systems) by sending the following API command: +configured (the only resources that can be configured prior are the logger and +the metrics systems) by sending the following API command: ```bash curl --unix-socket /tmp/firecracker.socket -i \ @@ -392,7 +402,7 @@ curl --unix-socket /tmp/firecracker.socket -i \ "backend_path": "./mem_file", "backend_type": "File" }, - "enable_diff_snapshots": true, + "track_dirty_pages": true, "resume_vm": false }' ``` @@ -428,7 +438,7 @@ curl --unix-socket /tmp/firecracker.socket -i \ -d '{ "snapshot_path": "./snapshot_file", "mem_file_path": "./mem_file", - "enable_diff_snapshots": true, + "track_dirty_pages": true, "resume_vm": false }' ``` @@ -459,35 +469,17 @@ to the new Firecracker process as they were to the original one. the guest memory and leads to undefined behavior. - The file indicated by `snapshot_path`, that is used to load from, is released and no longer used by this process. - - If `enable_diff_snapshots` is set, then diff snapshots can be taken - afterwards. + - If `track_dirty_pages` is set, subsequent diff snapshots will be based on + KVM dirty page tracking. - If `resume_vm` is set, the vm is automatically resumed if load is successful. - _on failure_: A specific error is reported and then the current Firecracker process is ended (as it might be in an invalid state). 
-*Notes*: Please, keep in mind that only by setting to true -`enable_diff_snapshots`, when loading a snapshot, or `track_dirty_pages`, when -configuring the machine on a fresh microVM, you can then create a `diff` -snapshot. Also, `track_dirty_pages` is not saved when creating a snapshot, so -you need to explicitly set `enable_diff_snapshots` when sending -`LoadSnapshot`command if you want to be able to do diff snapshots from a loaded -microVM. Another thing that you should be aware of is the following: if a fresh -microVM can create diff snapshots, then if you create a **full** snapshot, the -memory file contains the whole guest memory, while if you create a **diff** one, -that file is sparse and only contains the guest dirtied pages. With these in -mind, some possible snapshotting scenarios are the following: - -- `Boot from a fresh microVM` -> `Pause` -> `Create snapshot` -> `Resume` -> - `Pause` -> `Create snapshot` -> ... ; -- `Boot from a fresh microVM` -> `Pause` -> `Create snapshot` -> `Resume` -> - `Pause` -> `Resume` -> ... -> `Pause` -> `Create snapshot` -> ... ; -- `Load snapshot` -> `Resume` -> `Pause` -> `Create snapshot` -> `Resume` -> - `Pause` -> `Create snapshot` -> ... ; -- `Load snapshot` -> `Resume` -> `Pause` -> `Create snapshot` -> `Resume` -> - `Pause` -> `Resume` -> ... -> `Pause` -> `Create snapshot` -> ... ; where - `Create snapshot` can refer to either a full or a diff snapshot for all the - aforementioned flows. +*Notes*: The `track_dirty_pages` configuration is not saved when creating a +snapshot, so you need to explicitly set `track_dirty_pages` again when sending +the `LoadSnapshot` command if you want to be able to do dirty page tracking +based diff snapshots from a loaded microVM. 
It is also worth knowing, a microVM that is restored from snapshot will be resumed with the guest OS wall-clock continuing from the moment of the snapshot @@ -632,3 +624,5 @@ the compatibility table reported below: For example, a snapshot taken on a m6i.metal host running a 5.10 host kernel can be restored on a different m6i.metal host running a 6.1 host kernel (but not vice versa), but could not be restored on a c5n.metal host. + +[man mincore]: https://man7.org/linux/man-pages/man2/mincore.2.html diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index db3abe1eced..8a3dac13673 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -28,6 +28,9 @@ { "syscall": "write" }, + { + "syscall": "mincore" + }, { "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 95ceca1b7ef..c3462d2f86b 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -28,6 +28,9 @@ { "syscall": "write" }, + { + "syscall": "mincore" + }, { "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index 4a96292d11d..8284aa66287 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -13,7 +13,8 @@ use super::super::parsed_request::{ParsedRequest, RequestError}; use super::super::request::{Body, Method, StatusCode}; /// Deprecation message for the `mem_file_path` field. 
-const LOAD_DEPRECATION_MESSAGE: &str = "PUT /snapshot/load: mem_file_path field is deprecated."; +const LOAD_DEPRECATION_MESSAGE: &str = + "PUT /snapshot/load: mem_file_path and enable_diff_snapshots fields are deprecated."; /// None of the `mem_backend` or `mem_file_path` fields has been specified. pub const MISSING_FIELD: &str = "missing field: either `mem_backend` or `mem_file_path` is required"; @@ -80,7 +81,8 @@ fn parse_put_snapshot_load(body: &Body) -> Result { // Check for the presence of deprecated `mem_file_path` field and create // deprecation message if found. let mut deprecation_message = None; - if snapshot_config.mem_file_path.is_some() { + #[allow(deprecated)] + if snapshot_config.mem_file_path.is_some() || snapshot_config.enable_diff_snapshots { // `mem_file_path` field in request is deprecated. METRICS.deprecated_api.deprecated_http_api_calls.inc(); deprecation_message = Some(LOAD_DEPRECATION_MESSAGE); @@ -103,7 +105,9 @@ fn parse_put_snapshot_load(body: &Body) -> Result { let snapshot_params = LoadSnapshotParams { snapshot_path: snapshot_config.snapshot_path, mem_backend, - enable_diff_snapshots: snapshot_config.enable_diff_snapshots, + #[allow(deprecated)] + track_dirty_pages: snapshot_config.enable_diff_snapshots + || snapshot_config.track_dirty_pages, resume_vm: snapshot_config.resume_vm, network_overrides: snapshot_config.network_overrides, }; @@ -180,7 +184,7 @@ mod tests { backend_path: PathBuf::from("bar"), backend_type: MemBackendType::File, }, - enable_diff_snapshots: false, + track_dirty_pages: false, resume_vm: false, network_overrides: vec![], }; @@ -202,7 +206,7 @@ mod tests { "backend_path": "bar", "backend_type": "File" }, - "enable_diff_snapshots": true + "track_dirty_pages": true }"#; let expected_config = LoadSnapshotParams { snapshot_path: PathBuf::from("foo"), @@ -210,7 +214,7 @@ mod tests { backend_path: PathBuf::from("bar"), backend_type: MemBackendType::File, }, - enable_diff_snapshots: true, + track_dirty_pages: true, 
resume_vm: false, network_overrides: vec![], }; @@ -240,7 +244,7 @@ mod tests { backend_path: PathBuf::from("bar"), backend_type: MemBackendType::Uffd, }, - enable_diff_snapshots: false, + track_dirty_pages: false, resume_vm: true, network_overrides: vec![], }; @@ -276,7 +280,7 @@ mod tests { backend_path: PathBuf::from("bar"), backend_type: MemBackendType::Uffd, }, - enable_diff_snapshots: false, + track_dirty_pages: false, resume_vm: true, network_overrides: vec![NetworkOverride { iface_id: String::from("eth0"), @@ -306,7 +310,7 @@ mod tests { backend_path: PathBuf::from("bar"), backend_type: MemBackendType::File, }, - enable_diff_snapshots: false, + track_dirty_pages: false, resume_vm: true, network_overrides: vec![], }; diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 61a0057a1ad..f5c91a8ddbe 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1245,7 +1245,11 @@ definitions: enable_diff_snapshots: type: boolean description: - Enable support for incremental (diff) snapshots by tracking dirty guest pages. 
+ (Deprecated) Enable dirty page tracking to improve space efficiency of diff snapshots + track_dirty_pages: + type: boolean + description: + Enable dirty page tracking to improve space efficiency of diff snapshots mem_file_path: type: string description: diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 4699b80b185..a0a7e167587 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -347,7 +347,7 @@ pub fn restore_from_snapshot( return Err(SnapshotStateFromFileError::UnknownNetworkDevice.into()); } } - let track_dirty_pages = params.enable_diff_snapshots; + let track_dirty_pages = params.track_dirty_pages; let vcpu_count = microvm_state .vcpu_states diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index d868c022dd2..7fc9879c512 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -747,13 +747,6 @@ impl RuntimeApiController { ) -> Result { if create_params.snapshot_type == SnapshotType::Diff { log_dev_preview_warning("Virtual machine diff snapshots", None); - - if !self.vm_resources.machine_config.track_dirty_pages { - return Err(VmmActionError::NotSupported( - "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." 
- .to_string(), - )); - } } let mut locked_vmm = self.vmm.lock().unwrap(); @@ -1256,7 +1249,7 @@ mod tests { backend_type: MemBackendType::File, backend_path: PathBuf::new(), }, - enable_diff_snapshots: false, + track_dirty_pages: false, resume_vm: false, network_overrides: vec![], }, diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index 7cb16a2a213..ae2c4a9bd3b 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -113,7 +113,6 @@ pub fn default_vmm_no_boot(kernel_image: Option<&str>) -> (Arc>, Even create_vmm(kernel_image, false, false) } -#[cfg(target_arch = "x86_64")] pub fn dirty_tracking_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { create_vmm(kernel_image, true, true) } diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 27a7841d5a4..6be34333b43 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -64,9 +64,9 @@ pub struct LoadSnapshotParams { pub snapshot_path: PathBuf, /// Specifies guest memory backend configuration. pub mem_backend: MemBackendConfig, - /// Setting this flag will enable KVM dirty page tracking and will - /// allow taking subsequent incremental snapshots. - pub enable_diff_snapshots: bool, + /// Whether KVM dirty page tracking should be enabled, to optimize the size + /// of differential snapshots. + pub track_dirty_pages: bool, /// When set to true, the vm is also resumed if the snapshot load /// is successful. pub resume_vm: bool, @@ -90,7 +90,11 @@ pub struct LoadSnapshotConfig { pub mem_backend: Option, /// Whether or not to enable KVM dirty page tracking. #[serde(default)] + #[deprecated] pub enable_diff_snapshots: bool, + /// Whether KVM dirty page tracking should be enabled. + #[serde(default)] + pub track_dirty_pages: bool, /// Whether or not to resume the vm post snapshot load. 
#[serde(default)] pub resume_vm: bool, diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 7a8965a4b9a..1b48a59d6c7 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -15,6 +15,7 @@ use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; use kvm_ioctls::VmFd; use vmm_sys_util::eventfd::EventFd; +use crate::arch::host_page_size; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; use crate::logger::info; use crate::persist::CreateSnapshotError; @@ -204,6 +205,9 @@ impl Vm { .try_for_each(|(region, slot)| { self.fd() .get_dirty_log(slot, u64_to_usize(region.len())) + // Getting the dirty log failed. This is probably because dirty page tracking + // was disabled. Fall back to mincore in this case. + .or_else(|_| mincore_bitmap(region)) .map(|bitmap_region| _ = bitmap.insert(slot, bitmap_region)) })?; Ok(bitmap) @@ -278,6 +282,47 @@ impl Vm { } } +/// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used +/// if a diff snapshot is requested, but dirty page tracking wasn't enabled. +fn mincore_bitmap(region: &GuestRegionMmap) -> Result, vmm_sys_util::errno::Error> { + // TODO: Once Host 5.10 goes out of support, we can make this more robust and work on + // swap-enabled systems, by doing mlock2(MLOCK_ONFAULT)/munlock() in this function (to + // force swapped-out pages to get paged in, so that mincore will consider them incore). + // However, on AMD (m6a/m7a) 5.10, doing so introduces a 100%/30ms regression to snapshot + // creation, even if swap is disabled, so currently it cannot be done. + + // Mincore always works at PAGE_SIZE granularity, even if the VMA we are dealing with + // is a hugetlbfs VMA (e.g. to report a single hugepage as "present", mincore will + // give us 512 4k markers with the lowest bit set). 
+ let page_size = host_page_size(); + let mut mincore_bitmap = vec![0u8; u64_to_usize(region.len()) / page_size]; + let mut bitmap = vec![0u64; u64_to_usize(region.len()) / page_size]; + + // SAFETY: The safety invariants of GuestRegionMmap ensure that region.as_ptr() is a valid + // userspace mapping of size region.len() bytes. The bitmap has exactly one byte for each + // page in this userspace mapping. Note that mincore does not operate on bitmaps like + // KVM_MEM_LOG_DIRTY_PAGES, but rather it uses 8 bits per page (e.g. 1 byte), setting the + // least significant bit to 1 if the page corresponding to a byte is in core (available in + // the page cache and resolvable via just a minor page fault). + let r = unsafe { + libc::mincore( + region.as_ptr().cast::(), + u64_to_usize(region.len()), + mincore_bitmap.as_mut_ptr(), + ) + }; + + if r != 0 { + return vmm_sys_util::errno::errno_result(); + } + + for (page_idx, b) in mincore_bitmap.iter().enumerate() { + bitmap[page_idx / 64] |= (*b as u64 & 0x1) << (page_idx as u64 % 64); + } + + Ok(bitmap) +} + #[cfg(test)] pub(crate) mod tests { use vm_memory::GuestAddress; diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 55fb07c1aae..633a8598574 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -100,47 +100,30 @@ fn test_pause_resume_microvm() { vmm.lock().unwrap().stop(FcExitCode::Ok); } -#[test] -fn test_dirty_bitmap_error() { - // Error case: dirty tracking disabled. - let (vmm, _) = default_vmm(None); - - // The vmm will start with dirty page tracking = OFF. - // With dirty tracking disabled, the underlying KVM_GET_DIRTY_LOG ioctl will fail - // with errno 2 (ENOENT) because KVM can't find any guest memory regions with dirty - // page tracking enabled. 
- assert_eq!( - vmm.lock() - .unwrap() - .vm - .get_dirty_bitmap() - .unwrap_err() - .errno(), - 2 - ); - vmm.lock().unwrap().stop(FcExitCode::Ok); -} - #[test] #[cfg(target_arch = "x86_64")] fn test_dirty_bitmap_success() { - // The vmm will start with dirty page tracking = ON. - let (vmm, _) = vmm::test_utils::dirty_tracking_vmm(Some(NOISY_KERNEL_IMAGE)); - - // Let it churn for a while and dirty some pages... - thread::sleep(Duration::from_millis(100)); - let bitmap = vmm.lock().unwrap().vm.get_dirty_bitmap().unwrap(); - let num_dirty_pages: u32 = bitmap - .values() - .map(|bitmap_per_region| { - // Gently coerce to u32 - let num_dirty_pages_per_region: u32 = - bitmap_per_region.iter().map(|n| n.count_ones()).sum(); - num_dirty_pages_per_region - }) - .sum(); - assert!(num_dirty_pages > 0); - vmm.lock().unwrap().stop(FcExitCode::Ok); + let vms = [ + vmm::test_utils::dirty_tracking_vmm(Some(NOISY_KERNEL_IMAGE)), + default_vmm(Some(NOISY_KERNEL_IMAGE)), + ]; + + for (vmm, _) in vms { + // Let it churn for a while and dirty some pages... 
+ thread::sleep(Duration::from_millis(100)); + let bitmap = vmm.lock().unwrap().vm.get_dirty_bitmap().unwrap(); + let num_dirty_pages: u32 = bitmap + .values() + .map(|bitmap_per_region| { + // Gently coerce to u32 + let num_dirty_pages_per_region: u32 = + bitmap_per_region.iter().map(|n| n.count_ones()).sum(); + num_dirty_pages_per_region + }) + .sum(); + assert!(num_dirty_pages > 0); + vmm.lock().unwrap().stop(FcExitCode::Ok); + } } #[test] @@ -262,7 +245,7 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { backend_path: memory_file.as_path().to_path_buf(), backend_type: MemBackendType::File, }, - enable_diff_snapshots: false, + track_dirty_pages: false, resume_vm: true, network_overrides: vec![], })) @@ -346,7 +329,7 @@ fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &s backend_path: memory_file.as_path().to_path_buf(), backend_type: MemBackendType::File, }, - enable_diff_snapshots: false, + track_dirty_pages: false, resume_vm: false, network_overrides: vec![], }); diff --git a/tests/conftest.py b/tests/conftest.py index 1b5dcafe713..94d948bb7ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -416,7 +416,9 @@ def io_engine(request): return request.param -@pytest.fixture(params=[SnapshotType.DIFF, SnapshotType.FULL]) +@pytest.fixture( + params=[SnapshotType.DIFF, SnapshotType.DIFF_MINCORE, SnapshotType.FULL] +) def snapshot_type(request): """All possible snapshot types""" return request.param diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index a6582515491..72d167bd921 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -22,7 +22,7 @@ import uuid from collections import namedtuple from dataclasses import dataclass -from enum import Enum +from enum import Enum, auto from functools import lru_cache from pathlib import Path from typing import Optional @@ -49,13 +49,33 @@ class SnapshotType(Enum): """Supported snapshot types.""" - FULL = "Full" - DIFF = 
"Diff" + FULL = auto() + DIFF = auto() + DIFF_MINCORE = auto() def __repr__(self): cls_name = self.__class__.__name__ return f"{cls_name}.{self.name}" + @property + def needs_rebase(self) -> bool: + """Does this snapshot type need rebasing on top of a base snapshot before restoration?""" + return self in [SnapshotType.DIFF, SnapshotType.DIFF_MINCORE] + + @property + def needs_dirty_page_tracking(self) -> bool: + """Does taking this snapshot type require dirty page tracking to be enabled?""" + return self == SnapshotType.DIFF + + @property + def api_type(self) -> str: + """Converts this `SnapshotType` to the string value expected by the Firecracker API""" + match self: + case SnapshotType.FULL: + return "Full" + case SnapshotType.DIFF | SnapshotType.DIFF_MINCORE: + return "Diff" + def hardlink_or_copy(src, dst): """If src and dst are in the same device, hardlink. Otherwise, copy.""" @@ -79,15 +99,10 @@ class Snapshot: snapshot_type: SnapshotType meta: dict - @property - def is_diff(self) -> bool: - """Is this a DIFF snapshot?""" - return self.snapshot_type == SnapshotType.DIFF - def rebase_snapshot(self, base, use_snapshot_editor=False): """Rebases current incremental snapshot onto a specified base layer.""" - if not self.is_diff: - raise ValueError("Can only rebase DIFF snapshots") + if not self.snapshot_type.needs_rebase: + raise ValueError(f"Cannot rebase {self.snapshot_type}") if use_snapshot_editor: build_tools.run_snap_editor_rebase(base.mem, self.mem) else: @@ -962,7 +977,7 @@ def resume(self): def make_snapshot( self, - snapshot_type: SnapshotType | str, + snapshot_type: SnapshotType, *, mem_path: str = "mem", vmstate_path="vmstate", @@ -974,12 +989,11 @@ def make_snapshot( It pauses the microvm before taking the snapshot. 
""" - snapshot_type = SnapshotType(snapshot_type) self.pause() self.api.snapshot_create.put( mem_file_path=str(mem_path), snapshot_path=str(vmstate_path), - snapshot_type=snapshot_type.value, + snapshot_type=snapshot_type.api_type, ) root = Path(self.chroot()) return Snapshot( @@ -997,11 +1011,15 @@ def make_snapshot( def snapshot_diff(self, *, mem_path: str = "mem", vmstate_path="vmstate"): """Make a Diff snapshot""" - return self.make_snapshot("Diff", mem_path=mem_path, vmstate_path=vmstate_path) + return self.make_snapshot( + SnapshotType.DIFF, mem_path=mem_path, vmstate_path=vmstate_path + ) def snapshot_full(self, *, mem_path: str = "mem", vmstate_path="vmstate"): """Make a Full snapshot""" - return self.make_snapshot("Full", mem_path=mem_path, vmstate_path=vmstate_path) + return self.make_snapshot( + SnapshotType.FULL, mem_path=mem_path, vmstate_path=vmstate_path + ) def restore_from_snapshot( self, @@ -1067,7 +1085,7 @@ def restore_from_snapshot( self.api.snapshot_load.put( mem_backend=mem_backend, snapshot_path=str(jailed_vmstate), - enable_diff_snapshots=jailed_snapshot.is_diff, + enable_diff_snapshots=jailed_snapshot.snapshot_type.needs_dirty_page_tracking, resume_vm=resume, **optional_kwargs, ) @@ -1224,12 +1242,15 @@ def build_n_from_snapshot( if incremental: # When doing diff snapshots, we continuously overwrite the same base snapshot file from the first # iteration in-place with successive snapshots, so don't delete it! 
- if last_snapshot is not None and not last_snapshot.is_diff: + if ( + last_snapshot is not None + and not last_snapshot.snapshot_type.needs_rebase + ): last_snapshot.delete() next_snapshot = microvm.make_snapshot(current_snapshot.snapshot_type) - if current_snapshot.is_diff: + if current_snapshot.snapshot_type.needs_rebase: next_snapshot = next_snapshot.rebase_snapshot( current_snapshot, use_snapshot_editor ) @@ -1240,7 +1261,7 @@ def build_n_from_snapshot( microvm.kill() snapshot_copy.delete() - if last_snapshot is not None and not last_snapshot.is_diff: + if last_snapshot is not None and not last_snapshot.snapshot_type.needs_rebase: last_snapshot.delete() current_snapshot.delete() diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 2af89b66a69..2b786ea16ae 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -19,7 +19,6 @@ import host_tools.drive as drive_tools import host_tools.network as net_tools from framework import utils -from framework.microvm import SnapshotType from framework.properties import global_props from framework.utils import check_filesystem, check_output from framework.utils_vsock import ( @@ -76,6 +75,7 @@ def test_resume(uvm_nano, microvm_factory, resume_at_restore): assert restored_vm.state == "Paused" restored_vm.resume() assert restored_vm.state == "Running" + restored_vm.ssh.check_output("true") def test_snapshot_current_version(uvm_nano): @@ -131,14 +131,13 @@ def test_cycled_snapshot_restore( cycles = 3 logger = logging.getLogger("snapshot_sequence") - diff_snapshots = snapshot_type == SnapshotType.DIFF vm = microvm_factory.build(guest_kernel, rootfs) vm.spawn() vm.basic_config( vcpu_count=2, mem_size_mib=512, - track_dirty_pages=diff_snapshots, + track_dirty_pages=snapshot_type.needs_dirty_page_tracking, ) vm.set_cpu_template(cpu_template_any) vm.add_net_iface() @@ 
-390,21 +389,6 @@ def test_negative_snapshot_create(uvm_nano): mem_file_path="memfile", snapshot_path="statefile", snapshot_type="Full" ) - vm.api.vm.patch(state="Paused") - - # Try diff with dirty pages tracking disabled. - expected_msg = ( - "Diff snapshots are not allowed on uVMs with dirty page tracking disabled" - ) - with pytest.raises(RuntimeError, match=expected_msg): - vm.api.snapshot_create.put( - mem_file_path="memfile", snapshot_path="statefile", snapshot_type="Diff" - ) - assert not os.path.exists("statefile") - assert not os.path.exists("memfile") - - vm.kill() - def test_create_large_diff_snapshot(uvm_plain): """ diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index ae1cb595311..5083e751eb4 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -68,22 +68,30 @@ def test_hugetlbfs_boot(uvm_plain): ) -def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): +def test_hugetlbfs_snapshot( + microvm_factory, guest_kernel_linux_5_10, rootfs, snapshot_type +): """ - Test hugetlbfs snapshot restore via uffd + Test hugetlbfs snapshot restore via + + Despite guest memory being backed by huge pages, differential snapshots still work at 4K granularity. 
""" ### Create Snapshot ### vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs) vm.memory_monitor = None vm.spawn() - vm.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB, mem_size_mib=128) + vm.basic_config( + huge_pages=HugePagesConfig.HUGETLBFS_2MB, + mem_size_mib=128, + track_dirty_pages=snapshot_type.needs_dirty_page_tracking, + ) vm.add_net_iface() vm.start() check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") - snapshot = vm.snapshot_full() + snapshot = vm.make_snapshot(snapshot_type) vm.kill() @@ -95,46 +103,6 @@ def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") -def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain): - """ - Test hugetlbfs differential snapshot support. - - Despite guest memory being backed by huge pages, differential snapshots still work at 4K granularity. - """ - - ### Create Snapshot ### - uvm_plain.memory_monitor = None - uvm_plain.spawn() - uvm_plain.basic_config( - huge_pages=HugePagesConfig.HUGETLBFS_2MB, - mem_size_mib=128, - track_dirty_pages=True, - ) - uvm_plain.add_net_iface() - uvm_plain.start() - - # Wait for microvm to boot - - base_snapshot = uvm_plain.snapshot_diff() - uvm_plain.resume() - - # Run command to dirty some pages - uvm_plain.ssh.check_output("sync") - - snapshot_diff = uvm_plain.snapshot_diff() - snapshot_merged = snapshot_diff.rebase_snapshot(base_snapshot) - - uvm_plain.kill() - - vm = microvm_factory.build() - vm.spawn() - vm.restore_from_snapshot( - snapshot_merged, resume=True, uffd_handler_name="on_demand" - ) - - # Verify if the restored microvm works. 
- - @pytest.mark.parametrize("huge_pages", HugePagesConfig) def test_ept_violation_count( microvm_factory, diff --git a/tests/integration_tests/performance/test_snapshot.py b/tests/integration_tests/performance/test_snapshot.py index 6c885a6a723..55b509456a6 100644 --- a/tests/integration_tests/performance/test_snapshot.py +++ b/tests/integration_tests/performance/test_snapshot.py @@ -266,7 +266,7 @@ def test_snapshot_create_latency( vm.basic_config( vcpu_count=2, mem_size_mib=512, - track_dirty_pages=snapshot_type == SnapshotType.DIFF, + track_dirty_pages=snapshot_type.needs_dirty_page_tracking, ) vm.start() vm.pin_threads(0) @@ -275,14 +275,14 @@ def test_snapshot_create_latency( { **vm.dimensions, "performance_test": "test_snapshot_create_latency", - "snapshot_type": snapshot_type.value, + "snapshot_type": str(snapshot_type), } ) match snapshot_type: case SnapshotType.FULL: metric = "full_create_snapshot" - case SnapshotType.DIFF: + case SnapshotType.DIFF | SnapshotType.DIFF_MINCORE: metric = "diff_create_snapshot" for _ in range(ITERATIONS):