From 8fe3d11f69dcbf1535d0ed8b6ad29dafaa4b714a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 30 May 2025 18:14:21 +0200 Subject: [PATCH 01/27] chore: update kvm and vmm-sys-util dependencies We need the new KvmIrqRouting FamStruct wrapper from kvm-bindings, which though forces us to update vmm-sys-util to 0.14.0 and also bump all downstream dependencies of vmm-sys-util to use that version. Signed-off-by: Babis Chalios --- Cargo.lock | 249 ++++++++++++++++++++----------------- src/firecracker/Cargo.toml | 5 +- 2 files changed, 137 insertions(+), 117 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fcb3df086cd..ad895c46ac9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -64,9 +64,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.18" +version = "0.6.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" dependencies = [ "anstyle", "anstyle-parse", @@ -79,33 +79,33 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" 
+checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.8" +version = "3.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" +checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", @@ -246,9 +246,9 @@ checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" [[package]] name = "bumpalo" -version = "3.17.0" +version = "3.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee" [[package]] name = "byteorder" @@ -274,9 +274,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.25" +version = "1.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951" +checksum = "956a5e21988b87f372569b66183b78babf23ebc2e744b733e4350a752c4dafac" dependencies = [ "jobserver", "libc", @@ -294,9 +294,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" [[package]] name = "ciborium" @@ -348,9 +348,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.39" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd60e63e9be68e5fb56422e397cf9baddded06dae1d2e523401542383bc72a9f" +checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" dependencies 
= [ "clap_builder", "clap_derive", @@ -367,9 +367,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.39" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89cc6392a1f72bbeb820d71f32108f61fdaf18bc526e1d23954168a67759ef51" +checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" dependencies = [ "anstream", "anstyle", @@ -379,9 +379,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.32" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" dependencies = [ "heck", "proc-macro2", @@ -391,9 +391,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "clippy-tracing" @@ -419,9 +419,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "cpu-template-helper" @@ -673,25 +673,14 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.16" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.11.0+wasi-snapshot-preview1", -] - -[[package]] -name = "getrandom" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi 0.14.2+wasi-0.2.4", + "wasi", ] [[package]] @@ -722,9 +711,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.3" +version = "0.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" +checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" [[package]] name = "heck" @@ -850,7 +839,7 @@ version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" dependencies = [ - "getrandom 0.3.2", + "getrandom", "libc", ] @@ -912,7 +901,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets", + "windows-targets 0.53.0", ] [[package]] @@ -1087,9 +1076,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" [[package]] name = "portable-atomic-util" @@ -1130,15 +1119,15 @@ dependencies = [ [[package]] name = "proptest" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50" +checksum = "6fcdab19deb5195a31cf7726a210015ff1496ba1464fd42cb4f537b8b01b471f" dependencies = [ "bitflags 2.9.1", "lazy_static", "num-traits", - "rand 0.8.5", - "rand_chacha 0.3.1", + "rand", + "rand_chacha", "rand_xorshift", "regex-syntax", "unarray", @@ -1159,35 
+1148,14 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - [[package]] name = "rand" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" dependencies = [ - "rand_chacha 0.9.0", - "rand_core 0.9.3", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.4", + "rand_chacha", + "rand_core", ] [[package]] @@ -1197,16 +1165,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.3", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.16", + "rand_core", ] [[package]] @@ -1215,16 +1174,16 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.3.2", + "getrandom", ] [[package]] name = "rand_xorshift" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +checksum = 
"513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" dependencies = [ - "rand_core 0.6.4", + "rand_core", ] [[package]] @@ -1365,9 +1324,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.8" +version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" dependencies = [ "serde", ] @@ -1417,9 +1376,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.101" +version = "2.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +checksum = "f6397daf94fa90f058bd0fd88429dd9e5738999cca8d701813c80723add80462" dependencies = [ "proc-macro2", "quote", @@ -1487,9 +1446,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.22" +version = "0.8.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ae329d1f08c4d17a59bed7ff5b5a769d062e64a62d34a3261b219e62cd5aae" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ "serde", "serde_spanned", @@ -1499,18 +1458,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.9" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.22.26" +version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "310068873db2c5b3e7659d2cc35d21855dbafa50d1ce336397c666e3cb08137e" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ "indexmap", "serde", 
@@ -1522,9 +1481,9 @@ dependencies = [ [[package]] name = "toml_write" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb942dfe1d8e29a7ee7fcbde5bd2b9a25fb89aa70caea2eba3bee836ff41076" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" [[package]] name = "typenum" @@ -1619,9 +1578,9 @@ version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ - "getrandom 0.3.2", + "getrandom", "js-sys", - "rand 0.9.1", + "rand", "wasm-bindgen", ] @@ -1762,12 +1721,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -1884,7 +1837,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -1893,14 +1846,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -1909,48 +1878,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" version = "0.7.10" diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 57aeabc1648..9f659f16c8c 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -42,7 +42,10 @@ serde_json = "1.0.140" 
[dev-dependencies] cargo_toml = "0.22.1" libc = "0.2.172" -regex = { version = "1.11.1", default-features = false, features = ["std", "unicode-perl"] } +regex = { version = "1.11.1", default-features = false, features = [ + "std", + "unicode-perl", +] } # Dev-Dependencies for uffd examples serde = { version = "1.0.219", features = ["derive"] } From 6084af828ad6a66014434f9a0164b51a12be9fa8 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:14:33 +0200 Subject: [PATCH 02/27] pci: fixes in PCI crate Define thiserror::Error and displaydoc::Display for various error types in the vended PCI crate. This way we can embed them in our error types downstream. Also export a few types and struct fields that were private and we will be needing them. Signed-off-by: Babis Chalios --- Cargo.lock | 1 + src/pci/Cargo.toml | 1 + src/pci/src/bus.rs | 4 ++-- src/pci/src/configuration.rs | 6 +++--- src/pci/src/device.rs | 29 +++++------------------------ src/pci/src/lib.rs | 11 +++++++---- src/pci/src/msix.rs | 18 +++++++++++++++--- 7 files changed, 34 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ad895c46ac9..ade3eee8715 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1047,6 +1047,7 @@ name = "pci" version = "0.1.0" dependencies = [ "byteorder", + "displaydoc", "libc", "log", "serde", diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index c88cd270b23..3549d5010fe 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -13,6 +13,7 @@ default = [] [dependencies] byteorder = "1.5.0" +displaydoc = "0.2.5" libc = "0.2.172" log = "0.4.27" serde = { version = "1.0.219", features = ["derive"] } diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index cb42b4ee9c5..775238edff9 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -24,7 +24,7 @@ const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; const NUM_DEVICE_IDS: usize = 32; /// Errors for device manager. 
-#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum PciRootError { /// Could not allocate device address space for the device. AllocateDeviceAddrs(PciDeviceError), @@ -103,7 +103,7 @@ impl PciDevice for PciRoot { pub struct PciBus { /// Devices attached to this bus. /// Device 0 is host bridge. - devices: HashMap>>, + pub devices: HashMap>>, device_reloc: Arc, device_ids: Vec, } diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 3a53167148c..c37f8026fbe 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -409,7 +409,7 @@ struct PciBar { r#type: Option, } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct PciConfigurationState { registers: Vec, writable_bits: Vec, @@ -466,7 +466,7 @@ impl From for PciBarType { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub enum PciBarPrefetchable { NotPrefetchable = 0, Prefetchable = 0x08, @@ -481,7 +481,7 @@ impl From for bool { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub struct PciBarConfiguration { addr: u64, size: u64, diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs index d3bd3056a36..bf89331faa9 100644 --- a/src/pci/src/device.rs +++ b/src/pci/src/device.rs @@ -6,7 +6,6 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::any::Any; -use std::fmt::{self, Display}; use std::sync::{Arc, Barrier}; use std::{io, result}; @@ -16,39 +15,21 @@ use vm_device::Resource; use crate::configuration::{self, PciBarRegionType}; use crate::PciBarConfiguration; -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { - /// Setup of the device capabilities failed. + /// Setup of the device capabilities failed: {0}. CapabilitiesSetup(configuration::Error), - /// Allocating space for an IO BAR failed. + /// Allocating space for an IO BAR failed, size={0}. 
IoAllocationFailed(u64), - /// Registering an IO BAR failed. + /// Registering an IO BAR at address {0} failed: {1} IoRegistrationFailed(u64, configuration::Error), /// Expected resource not found. MissingResource, - /// Invalid resource. + /// Invalid resource InvalidResource(Resource), } pub type Result = std::result::Result; -impl Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::Error::*; - - match self { - CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"), - IoAllocationFailed(size) => { - write!(f, "failed to allocate space for an IO BAR, size={size}") - } - IoRegistrationFailed(addr, e) => { - write!(f, "failed to register an IO BAR, addr={addr} err={e}") - } - MissingResource => write!(f, "failed to find expected resource"), - InvalidResource(r) => write!(f, "invalid resource {r:?}"), - } - } -} - #[derive(Clone, Copy)] pub struct BarReprogrammingParams { pub old_base: u64, diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 2672159e474..3162da292de 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -24,15 +24,18 @@ use serde::de::Visitor; pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; pub use self::configuration::{ PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, - PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, - PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, - PCI_CONFIGURATION_ID, + PciClassCode, PciConfiguration, PciConfigurationState, PciExpressCapabilityId, PciHeaderType, + PciMassStorageSubclass, PciNetworkControllerSubclass, PciProgrammingInterface, + PciSerialBusSubClass, PciSubclass, PCI_CONFIGURATION_ID, }; pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; -pub use self::msix::{MsixCap, 
MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; +pub use self::msix::{ + Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry, MSIX_CONFIG_ID, + MSIX_TABLE_ENTRY_SIZE, +}; /// PCI has four interrupt pins A->D. #[derive(Copy, Clone)] diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs index 4b3cf688980..be5aa3b8cf1 100644 --- a/src/pci/src/msix.rs +++ b/src/pci/src/msix.rs @@ -26,7 +26,7 @@ const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; pub const MSIX_CONFIG_ID: &str = "msix_config"; -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { /// Failed enabling the interrupt route. EnableInterruptRoute(io::Error), @@ -59,7 +59,7 @@ impl Default for MsixTableEntry { } } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct MsixConfigState { table_entries: Vec, pba_entries: Vec, @@ -71,11 +71,23 @@ pub struct MsixConfig { pub table_entries: Vec, pub pba_entries: Vec, pub devid: u32, - interrupt_source_group: Arc, + pub interrupt_source_group: Arc, masked: bool, enabled: bool, } +impl std::fmt::Debug for MsixConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MsixConfig") + .field("table_entries", &self.table_entries) + .field("pba_entries", &self.pba_entries) + .field("devid", &self.devid) + .field("masked", &self.masked) + .field("enabled", &self.enabled) + .finish() + } +} + impl MsixConfig { pub fn new( msix_vectors: u16, From 44fc536d30f823658a8391050b788053e16fe0e4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 10 Jun 2025 17:05:27 +0200 Subject: [PATCH 03/27] vm-device: return reference to EventFd from Interrupt trait Instead of returning an `EventFd` type, which will actually force us to clone the file descriptor in the Firecracker side. 
Signed-off-by: Babis Chalios --- src/vm-device/src/interrupt/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs index f4aec52a2e0..da5d87a4e1a 100644 --- a/src/vm-device/src/interrupt/mod.rs +++ b/src/vm-device/src/interrupt/mod.rs @@ -172,7 +172,7 @@ pub trait InterruptSourceGroup: Send + Sync { /// to inject interrupts into a guest, by writing to the file returned /// by this method. #[allow(unused_variables)] - fn notifier(&self, index: InterruptIndex) -> Option; + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd>; /// Update the interrupt source group configuration. /// From a6cd1a9be53ccd8f75374b39a7f5ced22b9b5653 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 25 Jun 2025 12:57:06 +0200 Subject: [PATCH 04/27] cleanup: remove unused code from pci and vm-device crates This is code we are not going to use in Firecracker. Remove it, so we can keep the crates we vend as minimal as possible, including only things we are actually using. 
Signed-off-by: Babis Chalios --- src/pci/src/lib.rs | 2 - src/pci/src/msi.rs | 282 --------------------------- src/vm-device/src/dma_mapping/mod.rs | 18 -- src/vm-device/src/lib.rs | 1 - 4 files changed, 303 deletions(-) delete mode 100644 src/pci/src/msi.rs delete mode 100644 src/vm-device/src/dma_mapping/mod.rs diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 3162da292de..f1dec5b126a 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -12,7 +12,6 @@ extern crate log; mod bus; mod configuration; mod device; -mod msi; mod msix; use std::fmt::{self, Debug, Display}; @@ -31,7 +30,6 @@ pub use self::configuration::{ pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; -pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; pub use self::msix::{ Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs deleted file mode 100644 index 16d593cd115..00000000000 --- a/src/pci/src/msi.rs +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright © 2019 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -// - -use std::io; -use std::sync::Arc; - -use byteorder::{ByteOrder, LittleEndian}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use vm_device::interrupt::{ - InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, -}; - -// MSI control masks -const MSI_CTL_ENABLE: u16 = 0x1; -const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; -const MSI_CTL_64_BITS: u16 = 0x80; -const MSI_CTL_PER_VECTOR: u16 = 0x100; - -// MSI message offsets -const MSI_MSG_CTL_OFFSET: u64 = 0x2; -const MSI_MSG_ADDR_LO_OFFSET: u64 = 0x4; - -// MSI message masks -const MSI_MSG_ADDR_LO_MASK: u32 = 0xffff_fffc; - -pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { - let field = (msg_ctl >> 4) & 0x7; - - if field > 5 { - return 0; - } - - 1 << field -} - -#[derive(Error, 
Debug)] -pub enum Error { - #[error("Failed enabling the interrupt route: {0}")] - EnableInterruptRoute(io::Error), - #[error("Failed updating the interrupt route: {0}")] - UpdateInterruptRoute(io::Error), -} - -#[derive(Clone, Copy, Default, Serialize, Deserialize)] -pub struct MsiCap { - // Message Control Register - // 0: MSI enable. - // 3-1; Multiple message capable. - // 6-4: Multiple message enable. - // 7: 64 bits address capable. - // 8: Per-vector masking capable. - // 15-9: Reserved. - pub msg_ctl: u16, - // Message Address (LSB) - // 1-0: Reserved. - // 31-2: Message address. - pub msg_addr_lo: u32, - // Message Upper Address (MSB) - // 31-0: Message address. - pub msg_addr_hi: u32, - // Message Data - // 15-0: Message data. - pub msg_data: u16, - // Mask Bits - // 31-0: Mask bits. - pub mask_bits: u32, - // Pending Bits - // 31-0: Pending bits. - pub pending_bits: u32, -} - -impl MsiCap { - fn addr_64_bits(&self) -> bool { - self.msg_ctl & MSI_CTL_64_BITS == MSI_CTL_64_BITS - } - - fn per_vector_mask(&self) -> bool { - self.msg_ctl & MSI_CTL_PER_VECTOR == MSI_CTL_PER_VECTOR - } - - fn enabled(&self) -> bool { - self.msg_ctl & MSI_CTL_ENABLE == MSI_CTL_ENABLE - } - - fn num_enabled_vectors(&self) -> usize { - msi_num_enabled_vectors(self.msg_ctl) - } - - fn vector_masked(&self, vector: usize) -> bool { - if !self.per_vector_mask() { - return false; - } - - (self.mask_bits >> vector) & 0x1 == 0x1 - } - - fn size(&self) -> u64 { - let mut size: u64 = 0xa; - - if self.addr_64_bits() { - size += 0x4; - } - if self.per_vector_mask() { - size += 0xa; - } - - size - } - - fn update(&mut self, offset: u64, data: &[u8]) { - // Calculate message data offset depending on the address being 32 or - // 64 bits. - // Calculate upper address offset if the address is 64 bits. - // Calculate mask bits offset based on the address being 32 or 64 bits - // and based on the per vector masking being enabled or not. 
- let (msg_data_offset, addr_hi_offset, mask_bits_offset): (u64, Option, Option) = - if self.addr_64_bits() { - let mask_bits = if self.per_vector_mask() { - Some(0x10) - } else { - None - }; - (0xc, Some(0x8), mask_bits) - } else { - let mask_bits = if self.per_vector_mask() { - Some(0xc) - } else { - None - }; - (0x8, None, mask_bits) - }; - - // Update cache without overriding the read-only bits. - match data.len() { - 2 => { - let value = LittleEndian::read_u16(data); - match offset { - MSI_MSG_CTL_OFFSET => { - self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - | (value & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - } - x if x == msg_data_offset => self.msg_data = value, - _ => error!("invalid offset"), - } - } - 4 => { - let value = LittleEndian::read_u32(data); - match offset { - 0x0 => { - self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - | ((value >> 16) as u16 & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - } - MSI_MSG_ADDR_LO_OFFSET => self.msg_addr_lo = value & MSI_MSG_ADDR_LO_MASK, - x if x == msg_data_offset => self.msg_data = value as u16, - x if addr_hi_offset.is_some() && x == addr_hi_offset.unwrap() => { - self.msg_addr_hi = value - } - x if mask_bits_offset.is_some() && x == mask_bits_offset.unwrap() => { - self.mask_bits = value - } - _ => error!("invalid offset"), - } - } - _ => error!("invalid data length"), - } - } -} - -#[derive(Serialize, Deserialize)] -pub struct MsiConfigState { - cap: MsiCap, -} - -pub struct MsiConfig { - pub cap: MsiCap, - interrupt_source_group: Arc, -} - -impl MsiConfig { - pub fn new( - msg_ctl: u16, - interrupt_source_group: Arc, - state: Option, - ) -> Result { - let cap = if let Some(state) = state { - if state.cap.enabled() { - for idx in 0..state.cap.num_enabled_vectors() { - let config = MsiIrqSourceConfig { - high_addr: state.cap.msg_addr_hi, - low_addr: state.cap.msg_addr_lo, - data: state.cap.msg_data as u32, - devid: 0, - }; - - 
interrupt_source_group - .update( - idx as InterruptIndex, - InterruptSourceConfig::MsiIrq(config), - state.cap.vector_masked(idx), - false, - ) - .map_err(Error::UpdateInterruptRoute)?; - } - - interrupt_source_group - .set_gsi() - .map_err(Error::EnableInterruptRoute)?; - - interrupt_source_group - .enable() - .map_err(Error::EnableInterruptRoute)?; - } - - state.cap - } else { - MsiCap { - msg_ctl, - ..Default::default() - } - }; - - Ok(MsiConfig { - cap, - interrupt_source_group, - }) - } - - pub fn state(&self) -> MsiConfigState { - MsiConfigState { cap: self.cap } - } - - pub fn enabled(&self) -> bool { - self.cap.enabled() - } - - pub fn size(&self) -> u64 { - self.cap.size() - } - - pub fn num_enabled_vectors(&self) -> usize { - self.cap.num_enabled_vectors() - } - - pub fn update(&mut self, offset: u64, data: &[u8]) { - let old_enabled = self.cap.enabled(); - - self.cap.update(offset, data); - - if self.cap.enabled() { - for idx in 0..self.num_enabled_vectors() { - let config = MsiIrqSourceConfig { - high_addr: self.cap.msg_addr_hi, - low_addr: self.cap.msg_addr_lo, - data: self.cap.msg_data as u32, - devid: 0, - }; - - if let Err(e) = self.interrupt_source_group.update( - idx as InterruptIndex, - InterruptSourceConfig::MsiIrq(config), - self.cap.vector_masked(idx), - true, - ) { - error!("Failed updating vector: {:?}", e); - } - } - - if !old_enabled { - if let Err(e) = self.interrupt_source_group.enable() { - error!("Failed enabling irq_fd: {:?}", e); - } - } - } else if old_enabled { - if let Err(e) = self.interrupt_source_group.disable() { - error!("Failed disabling irq_fd: {:?}", e); - } - } - } -} diff --git a/src/vm-device/src/dma_mapping/mod.rs b/src/vm-device/src/dma_mapping/mod.rs deleted file mode 100644 index 6cba6e16488..00000000000 --- a/src/vm-device/src/dma_mapping/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause - -/// Trait to trigger DMA mapping updates for devices managed by virtio-iommu -/// -/// Trait meant for triggering the DMA mapping update related to an external -/// device not managed fully through virtio. It is dedicated to virtio-iommu -/// in order to trigger the map update anytime the mapping is updated from the -/// guest. -pub trait ExternalDmaMapping: Send + Sync { - /// Map a memory range - fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), std::io::Error>; - - /// Unmap a memory range - fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), std::io::Error>; -} diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs index fe06fd8b465..b980b09c4b9 100644 --- a/src/vm-device/src/lib.rs +++ b/src/vm-device/src/lib.rs @@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize}; mod bus; -pub mod dma_mapping; pub mod interrupt; pub use self::bus::{Bus, BusDevice, BusDeviceSync, Error as BusError}; From a172a8d2c0f8e2f9ffd6c8f1d3cd28940edc17a8 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 18:46:32 +0200 Subject: [PATCH 05/27] refactor: allow storing Arc within Vmm We'd like to be able to store Vm within an atomic reference so we can pass it around and share it with other components. The main issue with doing this change is that we need Vm to be `mut` during initialization and the builder.rs code was creating Vmm with Vm embedded in it. To solve this, we break down the initialization of the Vmm object. We first create its individual parts (Vm, Kvm and DeviceManager), perform any necessary initialization logic on Vm and once this is done, add it within an Arc.
Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/mod.rs | 23 +- src/vmm/src/arch/x86_64/mod.rs | 26 ++- src/vmm/src/builder.rs | 309 +++++++++++++++----------- src/vmm/src/device_manager/acpi.rs | 4 +- src/vmm/src/device_manager/mmio.rs | 13 +- src/vmm/src/device_manager/mod.rs | 12 + src/vmm/src/device_manager/persist.rs | 1 + src/vmm/src/lib.rs | 3 +- 8 files changed, 230 insertions(+), 161 deletions(-) diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index df6e712dcf5..a599db5dea7 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -32,7 +32,7 @@ use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{DeviceManager, Kvm, Vcpu, VcpuConfig, Vm, logger}; /// Errors thrown while configuring aarch64 system. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -82,8 +82,11 @@ pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { } /// Configures the system for booting Linux. +#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -103,11 +106,11 @@ pub fn configure_system_for_boot( cpu_config, }; - let optional_capabilities = vmm.kvm.optional_capabilities(); + let optional_capabilities = kvm.optional_capabilities(); // Configure vCPUs with normalizing and setting the generated CPU configuration. 
for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu.configure( - vmm.vm.guest_memory(), + vm.guest_memory(), entry_point, &vcpu_config, &optional_capabilities, @@ -123,18 +126,16 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); let fdt = fdt::create_fdt( - vmm.vm.guest_memory(), + vm.guest_memory(), vcpu_mpidr, cmdline, - &vmm.device_manager, - vmm.vm.get_irqchip(), + device_manager, + vm.get_irqchip(), initrd, )?; - let fdt_address = GuestAddress(get_fdt_addr(vmm.vm.guest_memory())); - vmm.vm - .guest_memory() - .write_slice(fdt.as_slice(), fdt_address)?; + let fdt_address = GuestAddress(get_fdt_addr(vm.guest_memory())); + vm.guest_memory().write_slice(fdt.as_slice(), fdt_address)?; Ok(()) } diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index fe1296e5d1c..68b903d5ff6 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -33,6 +33,7 @@ pub mod generated; use std::fs::File; +use kvm::Kvm; use layout::{ CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, @@ -53,6 +54,7 @@ use crate::acpi::create_acpi_tables; use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, arch_memory_regions_with_gap}; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::cpu_config::x86_64::CpuConfiguration; +use crate::device_manager::DeviceManager; use crate::initrd::InitrdConfig; use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; @@ -60,7 +62,7 @@ use crate::vstate::memory::{ Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; use crate::vstate::vcpu::KvmVcpuConfigureError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{Vcpu, VcpuConfig, Vm, logger}; // Value taken from 
https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 // Usable normal RAM @@ -169,8 +171,11 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> Opti } /// Configures the system for booting Linux. +#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -179,8 +184,7 @@ pub fn configure_system_for_boot( boot_cmdline: Cmdline, ) -> Result<(), ConfigurationError> { // Construct the base CpuConfiguration to apply CPU template onto. - let cpu_config = - CpuConfiguration::new(vmm.kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; + let cpu_config = CpuConfiguration::new(kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; // Apply CPU template to the base CpuConfiguration. let cpu_config = CpuConfiguration::apply_template(cpu_config, cpu_template)?; @@ -193,7 +197,7 @@ pub fn configure_system_for_boot( // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu - .configure(vmm.vm.guest_memory(), entry_point, &vcpu_config)?; + .configure(vm.guest_memory(), entry_point, &vcpu_config)?; } // Write the kernel command line to guest memory. 
This is x86_64 specific, since on @@ -204,7 +208,7 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); load_cmdline( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(crate::arch::x86_64::layout::CMDLINE_START), &boot_cmdline, ) @@ -212,19 +216,19 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( - vmm.vm.guest_memory(), - &vmm.device_manager.resource_allocator, + vm.guest_memory(), + &device_manager.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; match entry_point.protocol { BootProtocol::PvhBoot => { - configure_pvh(vmm.vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; + configure_pvh(vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; } BootProtocol::LinuxBoot => { configure_64bit_boot( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(CMDLINE_START), cmdline_size, initrd, @@ -234,7 +238,7 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables(vmm.vm.guest_memory(), &mut vmm.device_manager, vcpus)?; + create_acpi_tables(vm.guest_memory(), device_manager, vcpus)?; Ok(()) } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 2c037fc529f..290cf000c5e 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -16,17 +16,18 @@ use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; +#[cfg(target_arch = "aarch64")] +use crate::Vcpu; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; -use crate::cpu_config::templates::{ - GetCpuTemplate, GetCpuTemplateError, GuestConfigError, KvmCapability, -}; +use crate::cpu_config::templates::{GetCpuTemplate, GetCpuTemplateError, GuestConfigError}; #[cfg(target_arch = "aarch64")] 
use crate::device_manager::AttachLegacyMmioDeviceError; use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ - AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, + AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DeviceManagerCreateError, + DevicePersistError, DeviceRestoreArgs, }; use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; @@ -43,10 +44,10 @@ use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; -use crate::vstate::kvm::Kvm; +use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; -use crate::vstate::vcpu::{Vcpu, VcpuError}; -use crate::vstate::vm::Vm; +use crate::vstate::vcpu::VcpuError; +use crate::vstate::vm::{Vm, VmError}; use crate::{EventManager, Vmm, VmmError, device_manager}; /// Errors associated with starting the instance. @@ -61,6 +62,8 @@ pub enum StartMicrovmError { AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), + /// Failed to create device manager: {0} + CreateDeviceManager(#[from] DeviceManagerCreateError), /// Failed to create guest config: {0} CreateGuestConfig(#[from] GuestConfigError), /// Cannot create network device: {0} @@ -87,6 +90,8 @@ pub enum StartMicrovmError { GetCpuTemplate(#[from] GetCpuTemplateError), /// Invalid kernel command line: {0} KernelCmdline(String), + /// Kvm error: {0} + Kvm(#[from] KvmError), /// Cannot load command line string: {0} LoadCommandline(linux_loader::loader::Error), /// Cannot start microvm without kernel configuration. 
@@ -115,6 +120,8 @@ pub enum StartMicrovmError { /// Error cloning Vcpu fds #[cfg(feature = "gdb")] VcpuFdCloneError(#[from] crate::vstate::vcpu::CopyKvmFdError), + /// Error with the Vm object: {0} + Vm(#[from] VmError), } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -125,37 +132,6 @@ impl std::convert::From for StartMicrovmError { } } -#[cfg_attr(target_arch = "aarch64", allow(unused))] -fn create_vmm_and_vcpus( - instance_info: &InstanceInfo, - event_manager: &mut EventManager, - vcpu_count: u8, - kvm_capabilities: Vec, -) -> Result<(Vmm, Vec), VmmError> { - let kvm = Kvm::new(kvm_capabilities)?; - // Set up Kvm Vm and register memory regions. - // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(&kvm)?; - - let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?; - - let device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; - - let vmm = Vmm { - events_observer: Some(std::io::stdin()), - instance_info: instance_info.clone(), - shutdown_exit_code: None, - kvm, - vm, - uffd: None, - vcpus_handles: Vec::new(), - vcpus_exit_evt, - device_manager, - }; - - Ok((vmm, vcpus)) -} - /// Builds and starts a microVM based on the current Firecracker VmResources configuration. /// /// The built microVM and all the created vCPUs start off in the paused state. @@ -167,8 +143,6 @@ pub fn build_microvm_for_boot( event_manager: &mut EventManager, seccomp_filters: &BpfThreadMap, ) -> Result>, StartMicrovmError> { - use self::StartMicrovmError::*; - // Timestamp for measuring microVM boot duration. 
let request_ts = TimestampUs::default(); @@ -176,7 +150,7 @@ pub fn build_microvm_for_boot( .boot_source .builder .as_ref() - .ok_or(MissingKernelConfig)?; + .ok_or(StartMicrovmError::MissingKernelConfig)?; let guest_memory = vm_resources .allocate_guest_memory() @@ -191,19 +165,17 @@ pub fn build_microvm_for_boot( .cpu_template .get_cpu_template()?; - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - cpu_template.kvm_capabilities.clone(), - )?; + let kvm = Kvm::new(cpu_template.kvm_capabilities.clone())?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. + let mut vm = Vm::new(&kvm)?; + let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; + vm.register_memory_regions(guest_memory)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm)?; + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; - let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; - let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; + let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); @@ -214,11 +186,11 @@ pub fn build_microvm_for_boot( #[cfg(feature = "gdb")] let vcpu_fds = vcpus .iter() - .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) + .map(|vcpu| vcpu.copy_kvm_vcpu_fd(&vm)) .collect::, _>>()?; if vm_resources.pci_enabled { - vmm.device_manager.enable_pci()?; + device_manager.enable_pci()?; } else { boot_cmdline.insert("pci", "off")?; } @@ -227,53 +199,70 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. 
if vm_resources.boot_timer { - vmm.device_manager.attach_boot_timer_device(request_ts)?; + device_manager.attach_boot_timer_device(request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { - attach_balloon_device(&mut vmm, &mut boot_cmdline, balloon, event_manager)?; + attach_balloon_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + balloon, + event_manager, + )?; } attach_block_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.block.devices.iter(), event_manager, )?; attach_net_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.net_builder.iter(), event_manager, )?; if let Some(unix_vsock) = vm_resources.vsock.get() { - attach_unixsock_vsock_device(&mut vmm, &mut boot_cmdline, unix_vsock, event_manager)?; + attach_unixsock_vsock_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + unix_vsock, + event_manager, + )?; } if let Some(entropy) = vm_resources.entropy.get() { - attach_entropy_device(&mut vmm, &mut boot_cmdline, entropy, event_manager)?; + attach_entropy_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + entropy, + event_manager, + )?; } #[cfg(target_arch = "aarch64")] - vmm.device_manager.attach_legacy_devices_aarch64( - vmm.vm.fd(), - event_manager, - &mut boot_cmdline, - )?; + device_manager.attach_legacy_devices_aarch64(vm.fd(), event_manager, &mut boot_cmdline)?; - vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd())?; + device_manager.attach_vmgenid_device(vm.guest_memory(), vm.fd())?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&mut vmm, &mut vcpus)?; + setup_pvtime(&mut device_manager, &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } configure_system_for_boot( - &mut vmm, + &kvm, + &vm, + &mut device_manager, vcpus.as_mut(), &vm_resources.machine_config, &cpu_template, @@ -282,6 +271,18 @@ pub fn 
build_microvm_for_boot( boot_cmdline, )?; + let vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm: Arc::new(vm), + uffd: None, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; + let vmm = Arc::new(Mutex::new(vmm)); #[cfg(feature = "gdb")] @@ -293,7 +294,7 @@ pub fn build_microvm_for_boot( entry_point.entry_addr, gdb_socket_path, ) - .map_err(GdbServer)?; + .map_err(StartMicrovmError::GdbServer)?; } else { debug!("No GDB socket provided not starting gdb server."); } @@ -305,7 +306,7 @@ pub fn build_microvm_for_boot( vcpus, seccomp_filters .get("vcpu") - .ok_or_else(|| MissingSeccompFilters("vcpu".to_string()))? + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vcpu".to_string()))? .clone(), ) .map_err(VmmError::VcpuStart)?; @@ -317,7 +318,7 @@ pub fn build_microvm_for_boot( crate::seccomp::apply_filter( seccomp_filters .get("vmm") - .ok_or_else(|| MissingSeccompFilters("vmm".to_string()))?, + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vmm".to_string()))?, ) .map_err(VmmError::SeccompFilters)?; @@ -402,19 +403,21 @@ pub fn build_microvm_from_snapshot( ) -> Result>, BuildMicrovmFromSnapshotError> { // Build Vmm. debug!("event_start: build microvm from snapshot"); - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - microvm_state.kvm_state.kvm_cap_modifiers.clone(), - ) - .map_err(StartMicrovmError::Internal)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm) - .map_err(StartMicrovmError::Internal)?; - vmm.uffd = uffd; + let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone()) + .map_err(StartMicrovmError::Kvm)?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. 
+ let mut vm = Vm::new(&kvm).map_err(StartMicrovmError::Vm)?; + + let (mut vcpus, vcpus_exit_evt) = vm + .create_vcpus(vm_resources.machine_config.vcpu_count) + .map_err(StartMicrovmError::Vm)?; + + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd()).unwrap(); + + vm.register_memory_regions(guest_memory) + .map_err(StartMicrovmError::Vm)?; #[cfg(target_arch = "x86_64")] { @@ -434,7 +437,7 @@ pub fn build_microvm_from_snapshot( #[cfg(target_arch = "aarch64")] if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa { allocate_pvtime_region( - &mut vmm, + &mut device_manager, vcpus.len(), vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0), )?; @@ -452,28 +455,39 @@ pub fn build_microvm_from_snapshot( { let mpidrs = construct_kvm_mpidrs(µvm_state.vcpu_states); // Restore kvm vm state. - vmm.vm.restore_state(&mpidrs, µvm_state.vm_state)?; + vm.restore_state(&mpidrs, µvm_state.vm_state)?; } // Restore kvm vm state. #[cfg(target_arch = "x86_64")] - vmm.vm.restore_state(µvm_state.vm_state)?; + vm.restore_state(µvm_state.vm_state)?; // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; // Restore devices states. let device_ctor_args = DeviceRestoreArgs { - mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + mem: vm.guest_memory(), + vm: vm.fd(), event_manager, vm_resources, instance_id: &instance_info.id, - restored_from_file: vmm.uffd.is_none(), + restored_from_file: uffd.is_none(), }; - vmm.device_manager - .restore(µvm_state.device_states, device_ctor_args)?; + device_manager.restore(µvm_state.device_states, device_ctor_args)?; + + let mut vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm: Arc::new(vm), + uffd, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; // Move vcpus to their own threads and start their state machine in the 'Paused' state. 
vmm.start_vcpus( @@ -506,13 +520,12 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; - let addr = vmm - .device_manager + let addr = device_manager .resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) .map_err(StartMicrovmError::AllocateResources)?; @@ -521,10 +534,16 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] -fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmError> { +fn setup_pvtime( + device_manager: &mut DeviceManager, + vcpus: &mut [Vcpu], +) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region - let pvtime_mem: GuestAddress = - allocate_pvtime_region(vmm, vcpus.len(), vm_allocator::AllocPolicy::LastMatch)?; + let pvtime_mem: GuestAddress = allocate_pvtime_region( + device_manager, + vcpus.len(), + vm_allocator::AllocPolicy::LastMatch, + )?; // Register all vcpus with pvtime device for (i, vcpu) in vcpus.iter_mut().enumerate() { @@ -539,7 +558,8 @@ fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmErr } fn attach_entropy_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, @@ -551,9 +571,9 @@ fn attach_entropy_device( .to_string(); event_manager.add_subscriber(entropy_device.clone()); - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, entropy_device.clone(), cmdline, @@ -562,7 +582,8 @@ fn attach_entropy_device( } fn attach_block_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut 
DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, @@ -584,9 +605,9 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( }; // The device mutex mustn't be locked here otherwise it will deadlock. event_manager.add_subscriber(block.clone()); - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, block.clone(), cmdline, @@ -597,7 +618,8 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( } fn attach_net_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, @@ -606,9 +628,9 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( let id = net_device.lock().expect("Poisoned lock").id().clone(); event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, net_device.clone(), cmdline, @@ -619,7 +641,8 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( } fn attach_unixsock_vsock_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, @@ -627,9 +650,9 @@ fn attach_unixsock_vsock_device( let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, unix_vsock.clone(), cmdline, @@ -638,7 +661,8 @@ fn attach_unixsock_vsock_device( } fn attach_balloon_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, @@ -646,9 +670,9 @@ fn attach_balloon_device( let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, balloon.clone(), cmdline, @@ -743,7 +767,7 @@ pub(crate) mod tests { instance_info: InstanceInfo::default(), shutdown_exit_code: None, kvm, - vm, + vm: Arc::new(vm), uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, @@ -788,7 +812,8 @@ pub(crate) mod tests { } attach_block_devices( - vmm, + &mut vmm.device_manager, + &vmm.vm, cmdline, block_dev_configs.devices.iter(), event_manager, @@ -806,7 +831,13 @@ pub(crate) mod tests { let mut net_builder = NetBuilder::new(); net_builder.build(net_config).unwrap(); - let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager); + let res = attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ); res.unwrap(); } @@ -827,7 +858,14 @@ pub(crate) mod tests { Arc::new(Mutex::new(mmds)), ); - attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager).unwrap(); + attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ) + .unwrap(); } pub(crate) fn insert_vsock_device( @@ -840,7 +878,14 @@ pub(crate) mod tests { let vsock = VsockBuilder::create_unixsock_vsock(vsock_config).unwrap(); let vsock = 
Arc::new(Mutex::new(vsock)); - attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); + attach_unixsock_vsock_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &vsock, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager @@ -859,7 +904,14 @@ pub(crate) mod tests { let mut builder = EntropyDeviceBuilder::new(); let entropy = builder.build(entropy_config).unwrap(); - attach_entropy_device(vmm, cmdline, &entropy, event_manager).unwrap(); + attach_entropy_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &entropy, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager @@ -887,7 +939,14 @@ pub(crate) mod tests { builder.set(balloon_config).unwrap(); let balloon = builder.get().unwrap(); - attach_balloon_device(vmm, cmdline, balloon, event_manager).unwrap(); + attach_balloon_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + balloon, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 78f1254d2fa..8a447c4c065 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -6,7 +6,7 @@ use kvm_ioctls::VmFd; use crate::devices::acpi::vmgenid::VmGenId; -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ACPIDeviceManager { /// VMGenID device pub vmgenid: Option, @@ -15,7 +15,7 @@ pub struct ACPIDeviceManager { impl ACPIDeviceManager { /// Create a new ACPIDeviceManager object pub fn new() -> Self { - Self { vmgenid: None } + Default::default() } /// Attach a new VMGenID device to the microVM diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index f730dd5be0d..4e87833fa47 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -128,7 +128,7 @@ pub struct MMIODevice { } /// Manages the complexities of registering a MMIO device. 
-#[derive(Debug)] +#[derive(Debug, Default)] pub struct MMIODeviceManager { /// VirtIO devices using an MMIO transport layer pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, @@ -154,16 +154,7 @@ pub struct MMIODeviceManager { impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { - MMIODeviceManager { - virtio_devices: HashMap::new(), - boot_timer: None, - #[cfg(target_arch = "aarch64")] - rtc: None, - #[cfg(target_arch = "aarch64")] - serial: None, - #[cfg(target_arch = "x86_64")] - dsdt_data: vec![], - } + Default::default() } /// Allocates resources for a new device to be added. diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 2922060bb13..5457b22e39d 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -294,6 +294,18 @@ pub struct DeviceRestoreArgs<'a> { pub restored_from_file: bool, } +impl std::fmt::Debug for DeviceRestoreArgs<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DeviceRestoreArgs") + .field("mem", &self.mem) + .field("vm", &self.vm) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } +} + impl DeviceManager { pub fn save(&self) -> DevicesState { DevicesState { diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index e3c7d2a8475..f267212ba2e 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -240,6 +240,7 @@ pub struct ACPIDeviceManagerState { vmgenid: Option, } +#[derive(Debug)] pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a ResourceAllocator, diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 01ef9547d82..18177367ada 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs 
@@ -299,8 +299,9 @@ pub struct Vmm { // Guest VM core resources. kvm: Kvm, /// VM object - pub vm: Vm, + pub vm: Arc, // Save UFFD in order to keep it open in the Firecracker process, as well. + #[allow(unused)] uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. From d7b8d8f79381d7904adc2dfbb6ae5c807c7eea02 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 19:15:23 +0200 Subject: [PATCH 06/27] vm: track device interrupts within Vm object Add logic to track the device interrupts used by the microVM. This is not strictly needed right now, but we will need it when adding support for MSI-X interrupts. MSI-X interrupts are configured at runtime and we need to interact with KVM to set the interrupt routes. To do it, we need to keep track of all the interrupts the VM is using. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 27 ++++++----- src/vmm/src/builder.rs | 57 +++++------------------ src/vmm/src/device_manager/acpi.rs | 10 ++-- src/vmm/src/device_manager/legacy.rs | 15 +++--- src/vmm/src/device_manager/mmio.rs | 28 +++++------ src/vmm/src/device_manager/mod.rs | 29 ++++++------ src/vmm/src/device_manager/persist.rs | 9 ++-- src/vmm/src/vstate/vm.rs | 67 ++++++++++++++++++++++++++- 8 files changed, 131 insertions(+), 111 deletions(-) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 8e67a50bd64..a2a4992eb29 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -499,17 +499,16 @@ mod tests { use std::ffi::CString; use std::sync::{Arc, Mutex}; - use kvm_ioctls::Kvm; use linux_loader::cmdline as kernel_cmdline; use super::*; - use crate::EventManager; use crate::arch::aarch64::gic::create_gic; use crate::arch::aarch64::layout; use crate::device_manager::mmio::tests::DummyDevice; use crate::device_manager::tests::default_device_manager; use crate::test_utils::arch_mem; use 
crate::vstate::memory::GuestAddress; + use crate::{EventManager, Kvm, Vm}; // The `load` function from the `device_tree` will mistakenly check the actual size // of the buffer with the allocated size. This works around that. @@ -525,9 +524,9 @@ mod tests { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let mut event_manager = EventManager::new().unwrap(); let mut device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); cmdline.insert("console", "/dev/tty0").unwrap(); @@ -562,9 +561,9 @@ mod tests { fn test_create_fdt_with_vmgenid() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let mut device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); cmdline.insert("console", "/dev/tty0").unwrap(); @@ -585,9 +584,9 @@ mod tests { fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_GICv3.dtb"), @@ -642,9 +641,9 @@ mod tests { fn test_create_fdt_with_initrd() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm 
= kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_initrd_GICv3.dtb"), diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 290cf000c5e..b0712abc3a5 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -172,7 +172,7 @@ pub fn build_microvm_for_boot( let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; vm.register_memory_regions(guest_memory)?; - let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm)?; let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; @@ -248,9 +248,9 @@ pub fn build_microvm_for_boot( } #[cfg(target_arch = "aarch64")] - device_manager.attach_legacy_devices_aarch64(vm.fd(), event_manager, &mut boot_cmdline)?; + device_manager.attach_legacy_devices_aarch64(&vm, event_manager, &mut boot_cmdline)?; - device_manager.attach_vmgenid_device(vm.guest_memory(), vm.fd())?; + device_manager.attach_vmgenid_device(vm.guest_memory(), &vm)?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { @@ -414,7 +414,7 @@ pub fn build_microvm_from_snapshot( .create_vcpus(vm_resources.machine_config.vcpu_count) .map_err(StartMicrovmError::Vm)?; - let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd()).unwrap(); + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm).unwrap(); vm.register_memory_regions(guest_memory) .map_err(StartMicrovmError::Vm)?; @@ -468,7 +468,7 @@ pub fn build_microvm_from_snapshot( // Restore devices states. 
let device_ctor_args = DeviceRestoreArgs { mem: vm.guest_memory(), - vm: vm.fd(), + vm: &vm, event_manager, vm_resources, instance_id: &instance_info.id, @@ -571,14 +571,7 @@ fn attach_entropy_device( .to_string(); event_manager.add_subscriber(entropy_device.clone()); - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - entropy_device.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, entropy_device.clone(), cmdline, false) } fn attach_block_devices<'a, I: Iterator>> + Debug>( @@ -605,14 +598,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( }; // The device mutex mustn't be locked here otherwise it will deadlock. event_manager.add_subscriber(block.clone()); - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - block.clone(), - cmdline, - is_vhost_user, - )?; + device_manager.attach_virtio_device(vm, id, block.clone(), cmdline, is_vhost_user)?; } Ok(()) } @@ -628,14 +614,7 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( let id = net_device.lock().expect("Poisoned lock").id().clone(); event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - net_device.clone(), - cmdline, - false, - )?; + device_manager.attach_virtio_device(vm, id, net_device.clone(), cmdline, false)?; } Ok(()) } @@ -650,14 +629,7 @@ fn attach_unixsock_vsock_device( let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - unix_vsock.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, unix_vsock.clone(), cmdline, false) } fn attach_balloon_device( @@ -670,14 +642,7 @@ fn attach_balloon_device( let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - balloon.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, balloon.clone(), cmdline, false) } #[cfg(test)] @@ -924,7 +889,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd()) + .attach_vmgenid_device(vmm.vm.guest_memory(), &vmm.vm) .unwrap(); assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); } diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 8a447c4c065..3f0af80c7aa 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; +use crate::Vm; use crate::devices::acpi::vmgenid::VmGenId; #[derive(Debug, Default)] @@ -21,12 +21,8 @@ impl ACPIDeviceManager { /// Attach a new VMGenID device to the microVM /// /// This will register the device's interrupt with KVM - pub fn attach_vmgenid( - &mut self, - vmgenid: VmGenId, - vm_fd: &VmFd, - ) -> Result<(), kvm_ioctls::Error> { - vm_fd.register_irqfd(&vmgenid.interrupt_evt, vmgenid.gsi)?; + pub fn attach_vmgenid(&mut self, vmgenid: VmGenId, vm: &Vm) -> Result<(), kvm_ioctls::Error> { + vm.register_irq(&vmgenid.interrupt_evt, vmgenid.gsi)?; self.vmgenid = Some(vmgenid); Ok(()) } diff --git a/src/vmm/src/device_manager/legacy.rs 
b/src/vmm/src/device_manager/legacy.rs index cedb7abc32c..7011ae71122 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -11,11 +11,11 @@ use std::sync::{Arc, Mutex}; use acpi_tables::aml::AmlError; use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; use libc::EFD_NONBLOCK; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; +use crate::Vm; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{EventFdTrigger, I8042Device, SerialDevice, SerialEventsWrapper}; @@ -100,7 +100,7 @@ impl PortIODeviceManager { pub fn register_devices( &mut self, io_bus: &vm_device::Bus, - vm_fd: &VmFd, + vm: &Vm, ) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( @@ -148,18 +148,15 @@ impl PortIODeviceManager { Self::I8042_KDB_DATA_REGISTER_SIZE, )?; - vm_fd - .register_irqfd(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) + vm.register_irq(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) + vm.register_irq(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.kbd_evt, Self::KBD_EVT_GSI) + vm.register_irq(&self.kbd_evt, Self::KBD_EVT_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; @@ -264,6 +261,6 @@ mod tests { )), ) .unwrap(); - ldm.register_devices(&io_bus, vm.fd()).unwrap(); + ldm.register_devices(&io_bus, &vm).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 4e87833fa47..2d4432470d1 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -12,7 +12,7 @@ use std::sync::{Arc, Mutex}; #[cfg(target_arch = "x86_64")] use acpi_tables::{Aml, aml}; -use 
kvm_ioctls::{IoEventAddress, VmFd}; +use kvm_ioctls::IoEventAddress; use linux_loader::cmdline as kernel_cmdline; #[cfg(target_arch = "x86_64")] use log::debug; @@ -21,6 +21,7 @@ use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; +use crate::Vm; use crate::arch::BOOT_DEVICE_MEM_START; #[cfg(target_arch = "aarch64")] use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; @@ -184,7 +185,7 @@ impl MMIODeviceManager { /// Register a virtio-over-MMIO device to be used via MMIO transport at a specific slot. pub fn register_mmio_virtio( &mut self, - vm: &VmFd, + vm: &Vm, device_id: String, mmio_bus: &vm_device::Bus, device: MMIODevice, @@ -201,10 +202,11 @@ impl MMIODeviceManager { let io_addr = IoEventAddress::Mmio( device.resources.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), ); - vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) + vm.fd() + .register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; } - vm.register_irqfd(&mmio_device.interrupt.irq_evt, irq.get()) + vm.register_irq(&mmio_device.interrupt.irq_evt, irq.get()) .map_err(MmioError::RegisterIrqFd)?; } @@ -243,7 +245,7 @@ impl MMIODeviceManager { /// to the boot cmdline. pub fn register_mmio_virtio_for_boot( &mut self, - vm: &VmFd, + vm: &Vm, resource_allocator: &ResourceAllocator, device_id: String, mmio_device: MmioTransport, @@ -275,7 +277,7 @@ impl MMIODeviceManager { /// otherwise allocate a new MMIO resources for it. 
pub fn register_mmio_serial( &mut self, - vm: &VmFd, + vm: &Vm, resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, @@ -293,7 +295,7 @@ impl MMIODeviceManager { } }; - vm.register_irqfd( + vm.register_irq( serial.lock().expect("Poisoned lock").serial.interrupt_evt(), device_info.irq.unwrap().get(), ) @@ -557,7 +559,7 @@ pub(crate) mod tests { impl MMIODeviceManager { pub(crate) fn register_virtio_test_device( &mut self, - vm: &VmFd, + vm: &Vm, guest_mem: GuestMemoryMmap, resource_allocator: &ResourceAllocator, device: Arc>, @@ -690,7 +692,7 @@ pub(crate) mod tests { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy, @@ -744,7 +746,7 @@ pub(crate) mod tests { for _i in crate::arch::IRQ_BASE..=crate::arch::IRQ_MAX { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), @@ -758,7 +760,7 @@ pub(crate) mod tests { "{}", device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), @@ -803,7 +805,7 @@ pub(crate) mod tests { let id = String::from("foo"); let addr = device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy, @@ -834,7 +836,7 @@ pub(crate) mod tests { let id2 = String::from("foo2"); device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy2, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 5457b22e39d..a60a86ea7c3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -10,7 +10,6 @@ use std::sync::{Arc, Mutex}; use acpi::ACPIDeviceManager; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; #[cfg(target_arch = "x86_64")] use legacy::{LegacyDeviceError, 
PortIODeviceManager}; use linux_loader::loader::Cmdline; @@ -36,7 +35,7 @@ use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; -use crate::{EmulateSerialInitError, EventManager}; +use crate::{EmulateSerialInitError, EventManager, Vm}; /// ACPI device manager. pub mod acpi; @@ -143,7 +142,7 @@ impl DeviceManager { pub fn new( event_manager: &mut EventManager, vcpu_exit_evt: &EventFd, - vmfd: &VmFd, + vm: &Vm, ) -> Result { let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] @@ -160,7 +159,7 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vmfd)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; legacy_devices }; @@ -177,8 +176,7 @@ impl DeviceManager { /// Attaches a VirtioDevice device to the device manager and event manager. pub(crate) fn attach_virtio_device( &mut self, - mem: &GuestMemoryMmap, - vmfd: &VmFd, + vm: &Vm, id: String, device: Arc>, cmdline: &mut Cmdline, @@ -186,9 +184,10 @@ impl DeviceManager { ) -> Result<(), AttachMmioDeviceError> { let interrupt = Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); + let device = + MmioTransport::new(vm.guest_memory().clone(), interrupt, device, is_vhost_user); self.mmio_devices.register_mmio_virtio_for_boot( - vmfd, + vm, &self.resource_allocator, id, device, @@ -214,17 +213,17 @@ impl DeviceManager { pub(crate) fn attach_vmgenid_device( &mut self, mem: &GuestMemoryMmap, - vmfd: &VmFd, + vm: &Vm, ) -> Result<(), AttachVmgenidError> { let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; - self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; + self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } #[cfg(target_arch = "aarch64")] pub(crate) fn attach_legacy_devices_aarch64( &mut self, - vmfd: &VmFd, + vm: &Vm, event_manager: &mut EventManager, cmdline: &mut Cmdline, ) -> Result<(), AttachLegacyMmioDeviceError> { @@ -241,7 +240,7 @@ impl DeviceManager { Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; self.mmio_devices - .register_mmio_serial(vmfd, &self.resource_allocator, serial, None)?; + .register_mmio_serial(vm, &self.resource_allocator, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } @@ -287,7 +286,7 @@ pub enum DevicePersistError { pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Vm, pub event_manager: &'a mut EventManager, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, @@ -434,7 +433,7 @@ pub(crate) mod tests { let mut cmdline = Cmdline::new(4096).unwrap(); let mut event_manager = EventManager::new().unwrap(); vmm.device_manager - .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) .unwrap(); assert!(vmm.device_manager.mmio_devices.rtc.is_some()); assert!(vmm.device_manager.mmio_devices.serial.is_none()); @@ -442,7 +441,7 @@ pub(crate) mod tests { let mut vmm = default_vmm(); cmdline.insert("console", 
"/dev/blah").unwrap(); vmm.device_manager - .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) .unwrap(); assert!(vmm.device_manager.mmio_devices.rtc.is_some()); assert!(vmm.device_manager.mmio_devices.serial.is_some()); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index f267212ba2e..6b1168ec965 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -7,7 +7,6 @@ use std::fmt::{self, Debug}; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; use log::{error, warn}; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; @@ -15,7 +14,6 @@ use vm_allocator::AllocPolicy; use super::acpi::ACPIDeviceManager; use super::mmio::*; use super::resources::ResourceAllocator; -use crate::EventManager; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; @@ -51,6 +49,7 @@ use crate::resources::{ResourcesError, VmResources}; use crate::snapshot::Persist; use crate::vmm_config::mmds::MmdsConfigError; use crate::vstate::memory::GuestMemoryMmap; +use crate::{EventManager, Vm}; /// Errors for (de)serialization of the MMIO device manager. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -215,7 +214,7 @@ pub enum SharedDeviceType { pub struct MMIODevManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Vm, pub event_manager: &'a mut EventManager, pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, @@ -244,7 +243,7 @@ pub struct ACPIDeviceManagerState { pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a ResourceAllocator, - pub vm: &'a VmFd, + pub vm: &'a Vm, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -817,7 +816,7 @@ mod tests { let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + vm: &vmm.vm, event_manager: &mut event_manager, resource_allocator: &resource_allocator, vm_resources, diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 7a8965a4b9a..cf8879df033 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -9,10 +9,16 @@ use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::KVM_IRQCHIP_IOAPIC; +use kvm_bindings::{ + KVM_IRQ_ROUTING_IRQCHIP, KVM_MEM_LOG_DIRTY_PAGES, kvm_irq_routing_entry, + kvm_userspace_memory_region, +}; use kvm_ioctls::VmFd; +use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; @@ -26,6 +32,26 @@ use crate::vstate::memory::{ use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Errors related with Firecracker interrupts +pub enum InterruptError { + /// Error allocating resources: {0} + Allocator(#[from] 
vm_allocator::Error), + /// EventFd error: {0} + EventFd(std::io::Error), + /// FamStruct error: {0} + FamStruct(#[from] vmm_sys_util::fam::Error), + /// KVM error: {0} + Kvm(#[from] kvm_ioctls::Error), +} + +#[derive(Debug)] +/// A struct representing an interrupt line used by some device of the microVM +pub struct RoutingEntry { + entry: kvm_irq_routing_entry, + masked: bool, +} + /// Architecture independent parts of a VM. #[derive(Debug)] pub struct VmCommon { @@ -34,6 +60,8 @@ pub struct VmCommon { max_memslots: usize, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, + /// Interrupts used by Vm's devices + pub interrupts: Mutex>, } /// Errors associated with the wrappers over KVM ioctls. @@ -101,6 +129,7 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + interrupts: Mutex::new(HashMap::new()), }) } @@ -276,6 +305,40 @@ impl Vm { file.sync_all() .map_err(|err| MemoryBackingFile("sync_all", err)) } + + /// Register a device IRQ + pub fn register_irq(&self, fd: &EventFd, gsi: u32) -> Result<(), errno::Error> { + self.common.fd.register_irqfd(fd, gsi)?; + + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + #[cfg(target_arch = "x86_64")] + { + entry.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC; + } + #[cfg(target_arch = "aarch64")] + { + entry.u.irqchip.irqchip = 0; + } + + entry.u.irqchip.pin = gsi; + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert( + gsi, + RoutingEntry { + entry, + masked: false, + }, + ); + Ok(()) + } } #[cfg(test)] From 0ed1d5b8dc201e2d52aad82952e62cf301670beb Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 12:28:39 +0200 Subject: [PATCH 07/27] interrupts: add support for MSI/MSI-X interrupts Enable Vm to vend and manage MSI/MSI-X interrupts. This adds the logic to create a set of MSI vectors and then handle their lifetime. 
Signed-off-by: Babis Chalios --- src/vmm/src/vstate/vm.rs | 442 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 439 insertions(+), 3 deletions(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index cf8879df033..47c3011f37d 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -9,19 +9,25 @@ use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; #[cfg(target_arch = "x86_64")] use kvm_bindings::KVM_IRQCHIP_IOAPIC; use kvm_bindings::{ - KVM_IRQ_ROUTING_IRQCHIP, KVM_MEM_LOG_DIRTY_PAGES, kvm_irq_routing_entry, - kvm_userspace_memory_region, + KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MSI_VALID_DEVID, + KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, }; use kvm_ioctls::VmFd; +use log::debug; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::utils::u64_to_usize; @@ -52,6 +58,148 @@ pub struct RoutingEntry { masked: bool, } +/// Type that describes an allocated interrupt +#[derive(Debug)] +pub struct MsiVector { + /// GSI used for this vector + pub gsi: u32, + /// EventFd used for this vector + pub event_fd: EventFd, + /// Flag determining whether the vector is enabled + pub enabled: AtomicBool, +} + +impl MsiVector { + /// Create a new [`MsiVector`] of a particular type + pub fn new(gsi: u32, enabled: bool) -> Result { + Ok(MsiVector { + gsi, + event_fd: EventFd::new(libc::EFD_NONBLOCK).map_err(InterruptError::EventFd)?, + enabled: AtomicBool::new(enabled), + }) + } +} + +impl MsiVector { + /// Enable vector + fn enable(&self, vmfd: &VmFd) -> 
Result<(), errno::Error> { + if !self.enabled.load(Ordering::Acquire) { + vmfd.register_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(true, Ordering::Release); + } + + Ok(()) + } + + /// Disable vector + fn disable(&self, vmfd: &VmFd) -> Result<(), errno::Error> { + if self.enabled.load(Ordering::Acquire) { + vmfd.unregister_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(false, Ordering::Release); + } + + Ok(()) + } +} + +#[derive(Debug)] +/// MSI interrupts created for a VirtIO device +pub struct MsiVectorGroup { + vm: Arc, + irq_routes: HashMap, +} + +impl MsiVectorGroup { + /// Returns the number of vectors in this group + pub fn num_vectors(&self) -> u16 { + // It is safe to unwrap here. We are creating `MsiVectorGroup` objects through the + // `Vm::create_msix_group` where the argument for the number of `irq_routes` is a `u16`. + u16::try_from(self.irq_routes.len()).unwrap() + } +} + +impl InterruptSourceGroup for MsiVectorGroup { + fn enable(&self) -> vm_device::interrupt::Result<()> { + for route in self.irq_routes.values() { + route.enable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn disable(&self) -> vm_device::interrupt::Result<()> { + for route in self.irq_routes.values() { + route.disable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()> { + self.notifier(index) + .ok_or(std::io::Error::other(format!( + "trigger: invalid interrupt index {index}" + )))? 
+ .write(1) + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + self.irq_routes.get(&index).map(|route| &route.event_fd) + } + + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> vm_device::interrupt::Result<()> { + let msi_config = match config { + InterruptSourceConfig::LegacyIrq(_) => { + return Err(std::io::Error::other( + "MSI-x update: invalid configuration type", + )); + } + InterruptSourceConfig::MsiIrq(config) => config, + }; + + if let Some(route) = self.irq_routes.get(&index) { + // When an interrupt is masked the GSI will not be passed to KVM through + // KVM_SET_GSI_ROUTING. So, call [`disable()`] to unregister the interrupt file + // descriptor before passing the interrupt routes to KVM + if masked { + route.disable(&self.vm.common.fd)?; + } + + self.vm.register_msi(route, masked, msi_config)?; + if set_gsi { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}")))? + } + + // Assign KVM_IRQFD after KVM_SET_GSI_ROUTING to avoid + // panic on kernel which does not have commit a80ced6ea514 + // (KVM: SVM: fix panic on out-of-bounds guest IRQ). + if !masked { + route.enable(&self.vm.common.fd)?; + } + + return Ok(()); + } + + Err(std::io::Error::other(format!( + "MSI-X update: invalid vector index {index}" + ))) + } + + fn set_gsi(&self) -> vm_device::interrupt::Result<()> { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}"))) + } +} + /// Architecture independent parts of a VM. 
#[derive(Debug)] pub struct VmCommon { @@ -323,7 +471,6 @@ impl Vm { { entry.u.irqchip.irqchip = 0; } - entry.u.irqchip.pin = gsi; self.common @@ -339,10 +486,89 @@ impl Vm { ); Ok(()) } + + /// Register an MSI device interrupt + pub fn register_msi( + &self, + route: &MsiVector, + masked: bool, + config: MsiIrqSourceConfig, + ) -> Result<(), errno::Error> { + let mut entry = kvm_irq_routing_entry { + gsi: route.gsi, + type_: KVM_IRQ_ROUTING_MSI, + ..Default::default() + }; + entry.u.msi.address_lo = config.low_addr; + entry.u.msi.address_hi = config.high_addr; + entry.u.msi.data = config.data; + + if self.common.fd.check_extension(kvm_ioctls::Cap::MsiDevid) { + // According to KVM documentation: + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-gsi-routing + // + // if the capability is set, we need to set the flag and provide a valid unique device + // ID. "For PCI, this is usually a BDF identifier in the lower 16 bits". + // + // The layout of `config.devid` is: + // + // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --| + // | segment | bus | device | function | + // + // For the time being, we are using a single PCI segment and a single bus per segment + // so just passing config.devid should be fine. + entry.flags = KVM_MSI_VALID_DEVID; + entry.u.msi.__bindgen_anon_1.devid = config.devid; + } + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert(route.gsi, RoutingEntry { entry, masked }); + + Ok(()) + } + + /// Create a group of MSI-X interrupts + pub fn create_msix_group( + vm: Arc, + resource_allocator: &ResourceAllocator, + count: u16, + ) -> Result { + debug!("Creating new MSI group with {count} vectors"); + let mut irq_routes = HashMap::with_capacity(count as usize); + for (gsi, i) in resource_allocator + .allocate_gsi(count as u32)? + .iter() + .zip(0u32..) 
+ { + irq_routes.insert(i, MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { vm, irq_routes }) + } + + /// Set GSI routes to KVM + pub fn set_gsi_routes(&self) -> Result<(), InterruptError> { + let entries = self.common.interrupts.lock().expect("Poisoned lock"); + let mut routes = KvmIrqRouting::new(0)?; + + for entry in entries.values() { + if entry.masked { + continue; + } + routes.push(entry.entry)?; + } + + self.common.fd.set_gsi_routing(&routes)?; + Ok(()) + } } #[cfg(test)] pub(crate) mod tests { + use vm_device::interrupt::{InterruptSourceConfig, LegacyIrqSourceConfig}; use vm_memory::GuestAddress; use vm_memory::mmap::MmapRegionBuilder; @@ -454,4 +680,214 @@ pub(crate) mod tests { assert_eq!(vcpu_vec.len(), vcpu_count as usize); } + + fn enable_irqchip(vm: &mut Vm) { + #[cfg(target_arch = "x86_64")] + vm.setup_irqchip().unwrap(); + #[cfg(target_arch = "aarch64")] + vm.setup_irqchip(1).unwrap(); + } + + fn create_msix_group(vm: &Arc) -> MsiVectorGroup { + let resource_allocator = ResourceAllocator::new().unwrap(); + Vm::create_msix_group(vm.clone(), &resource_allocator, 4).unwrap() + } + + #[test] + fn test_msi_vector_group_new() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + assert_eq!(msix_group.num_vectors(), 4); + } + + #[test] + fn test_msi_vector_group_enable_disable() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // Initially all vectors are disabled + for route in msix_group.irq_routes.values() { + assert!(!route.enabled.load(Ordering::Acquire)) + } + + // Enable works + msix_group.enable().unwrap(); + for route in msix_group.irq_routes.values() { + assert!(route.enabled.load(Ordering::Acquire)); + } + // Enabling an enabled group doesn't error out + msix_group.enable().unwrap(); + + // Disable works + msix_group.disable().unwrap(); + for route in 
msix_group.irq_routes.values() { + assert!(!route.enabled.load(Ordering::Acquire)) + } + // Disabling a disabled group doesn't error out + } + + #[test] + fn test_msi_vector_group_trigger() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // We can now trigger all vectors + for i in 0..4 { + msix_group.trigger(i).unwrap() + } + + // We can't trigger an invalid vector + msix_group.trigger(4).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_notifier() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + for i in 0..4 { + assert!(msix_group.notifier(i).is_some()); + } + + assert!(msix_group.notifier(4).is_none()); + } + + #[test] + fn test_msi_vector_group_update_wrong_config() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let irq_config = LegacyIrqSourceConfig { irqchip: 0, pin: 0 }; + msix_group + .update(0, InterruptSourceConfig::LegacyIrq(irq_config), true, true) + .unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update_invalid_vector() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let config = InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x12, + data: 0x12, + devid: 0xafa, + }); + msix_group.update(0, config, true, true).unwrap(); + msix_group.update(4, config, true, true).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + assert!(vm.common.interrupts.lock().unwrap().is_empty()); + let msix_group = create_msix_group(&vm); + + // Set some configuration for the vectors. 
Initially all are masked + let mut config = MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x13, + data: 0x12, + devid: 0xafa, + }; + for i in 0..4 { + config.data = 0x12 * i; + msix_group + .update(i, InterruptSourceConfig::MsiIrq(config), true, false) + .unwrap(); + } + + // All vectors should be disabled + for vector in msix_group.irq_routes.values() { + assert!(!vector.enabled.load(Ordering::Acquire)); + } + + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Simply enabling the vectors should not update the registered IRQ routes + msix_group.enable().unwrap(); + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. 
+ unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Updating the config of a vector should enable its route (and only its route) + config.data = 0; + msix_group + .update(0, InterruptSourceConfig::MsiIrq(config), false, true) + .unwrap(); + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert_eq!(kvm_route.masked, i != 0); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_msi_vector_group_set_gsi_without_ioapic() { + // Setting GSI routes without IOAPIC setup should fail on x86. Apparently, it doesn't fail + // on Aarch64 + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let err = msix_group.set_gsi().unwrap_err(); + assert_eq!( + format!("{err}"), + "MSI-X update: KVM error: Invalid argument (os error 22)" + ); + } + + #[test] + fn test_msi_vector_group_set_gsi() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.set_gsi().unwrap(); + } } From 3b91be4dca81cb1f8b23c5e30c2cd4374815f1cd Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 10 Jun 2025 18:46:23 +0200 Subject: [PATCH 08/27] vstate: support serializing interrupts to snapshots Vm object is now maintaining information about the interrupts (both traditional IRQs and MSI-X vectors) that are being used by microVM devices. 
Derive Serialize/Deserialize add logic for recreating objects for relevant types. Signed-off-by: Babis Chalios --- src/vmm/src/vstate/vm.rs | 58 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 47c3011f37d..950bcac652d 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -20,6 +20,7 @@ use kvm_bindings::{ }; use kvm_ioctls::VmFd; use log::debug; +use serde::{Deserialize, Serialize}; use vm_device::interrupt::{ InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, }; @@ -30,6 +31,7 @@ pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ @@ -51,7 +53,7 @@ pub enum InterruptError { Kvm(#[from] kvm_ioctls::Error), } -#[derive(Debug)] +#[derive(Debug, Serialize, Deserialize)] /// A struct representing an interrupt line used by some device of the microVM pub struct RoutingEntry { entry: kvm_irq_routing_entry, @@ -118,6 +120,38 @@ impl MsiVectorGroup { } } +impl<'a> Persist<'a> for MsiVectorGroup { + type State = HashMap; + type ConstructorArgs = Arc; + type Error = InterruptError; + + fn save(&self) -> Self::State { + // We don't save the "enabled" state of the MSI interrupt. 
PCI devices store the MSI-X + // configuration and make sure that the vector is enabled during the restore path if it was + // initially enabled + self.irq_routes + .iter() + .map(|(id, route)| (*id, route.gsi)) + .collect() + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mut irq_routes = HashMap::new(); + + for (id, gsi) in state { + irq_routes.insert(*id, MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { + vm: constructor_args, + irq_routes, + }) + } +} + impl InterruptSourceGroup for MsiVectorGroup { fn enable(&self) -> vm_device::interrupt::Result<()> { for route in self.irq_routes.values() { @@ -890,4 +924,26 @@ pub(crate) mod tests { msix_group.set_gsi().unwrap(); } + + #[test] + fn test_msi_vector_group_persistence() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.enable().unwrap(); + let state = msix_group.save(); + let restored_group = MsiVectorGroup::restore(vm, &state).unwrap(); + + assert_eq!(msix_group.num_vectors(), restored_group.num_vectors()); + // Even if an MSI group is enabled, we don't save it as such. During restoration, the PCI + // transport will make sure the correct config is set for the vectors and enable them + // accordingly. + for (id, vector) in msix_group.irq_routes { + let new_vector = restored_group.irq_routes.get(&id).unwrap(); + assert_eq!(vector.gsi, new_vector.gsi); + assert!(!new_vector.enabled.load(Ordering::Acquire)); + } + } } From 7e9bbdc2fd3deebfefa9df5acffbe689f24377ac Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:08:21 +0200 Subject: [PATCH 09/27] fix(block): use correct index for interrupt Commit d8c27140 (refactor: use VirtioInterrupt in VirtIO devices) which refactored devices to use new VirtioInterrupt type introduced a bug with the index used to trigger a queue interrupt. 
Instead of using the actual queue index, we were using the index of the used descriptor. Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/block/virtio/device.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index aa28a325e1c..bcfea7b6676 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -397,7 +397,7 @@ impl VirtioBlock { if queue.prepare_kick() { interrupt - .trigger(VirtioInterruptType::Queue(index)) + .trigger(VirtioInterruptType::Queue(0)) .unwrap_or_else(|_| { block_metrics.event_fails.inc(); }); From b8aa9e150e4572b07036241fee94e2bec2903426 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:10:50 +0200 Subject: [PATCH 10/27] virtio: initialize queue size with max_size Apparently, PCI needs Queue::size to be initialized to the maximum possible size supported by the device, otherwise initialization fails. 
Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/queue.rs | 2 +- src/vmm/src/devices/virtio/transport/mmio.rs | 2 +- src/vmm/src/devices/virtio/vhost_user.rs | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index 686d3ee3da3..53174c0891b 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -262,7 +262,7 @@ impl Queue { pub fn new(max_size: u16) -> Queue { Queue { max_size, - size: 0, + size: max_size, ready: false, desc_table_address: GuestAddress(0), avail_ring_address: GuestAddress(0), diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 9871cb0ed6e..54837694ed4 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -798,7 +798,7 @@ pub(crate) mod tests { assert_eq!(d.queue_select, 3); d.queue_select = 0; - assert_eq!(d.locked_device().queues()[0].size, 0); + assert_eq!(d.locked_device().queues()[0].size, 16); write_le_u32(&mut buf[..], 16); d.write(0x0, 0x38, &buf[..]); assert_eq!(d.locked_device().queues()[0].size, 16); diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 53e479ef652..4766c96edb7 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -922,10 +922,10 @@ pub(crate) mod tests { // the backend. let expected_config = VringData { index: 0, - size: 0, + size: 69, config: VringConfigData { queue_max_size: 69, - queue_size: 0, + queue_size: 69, flags: 0, desc_table_addr: guest_memory .get_host_address(queue.desc_table_address) From 1a5aafc85cb814f7d3ef096f3e1663fec76d04ed Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:13:08 +0200 Subject: [PATCH 11/27] acpi: PCI compatible flags in FADT Remove the flags in FADT that were declaring we do not support MSI and PCI ASPM. 
Signed-off-by: Babis Chalios --- src/vmm/src/acpi/x86_64.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/acpi/x86_64.rs b/src/vmm/src/acpi/x86_64.rs index de850a9989f..53eeac7b5e2 100644 --- a/src/vmm/src/acpi/x86_64.rs +++ b/src/vmm/src/acpi/x86_64.rs @@ -3,10 +3,7 @@ use std::mem::size_of; -use acpi_tables::fadt::{ - IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT, IAPC_BOOT_ARG_FLAGS_PCI_ASPM, - IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT, -}; +use acpi_tables::fadt::IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT; use acpi_tables::madt::{IoAPIC, LocalAPIC}; use acpi_tables::{Fadt, aml}; use vm_memory::GuestAddress; @@ -33,11 +30,7 @@ pub(crate) fn setup_arch_fadt(fadt: &mut Fadt) { // neither do we support ASPM, or MSI type of interrupts. // More info here: // https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html?highlight=0a06#ia-pc-boot-architecture-flags - fadt.setup_iapc_flags( - (1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT) - | (1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM) - | (1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT), - ); + fadt.setup_iapc_flags(1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT); } #[inline(always)] From cc9e9f6a43157d5454c2a0b6d5c84b2b29b9c0d8 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 14:05:42 +0200 Subject: [PATCH 12/27] vmm: simplify device errors Merge the device-related errors that DeviceManager might return. This way, we can avoid adding yet another error type for PCI devices and reduce some the variants of StartMicrovmError. 
Suggested-by: Egor Lazarchuk Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 25 ++++++++++--------------- src/vmm/src/device_manager/mod.rs | 31 +++++++++---------------------- 2 files changed, 19 insertions(+), 37 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index b0712abc3a5..8bb4dff867b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -22,12 +22,12 @@ use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{GetCpuTemplate, GetCpuTemplateError, GuestConfigError}; -#[cfg(target_arch = "aarch64")] -use crate::device_manager::AttachLegacyMmioDeviceError; +#[cfg(target_arch = "x86_64")] +use crate::device_manager; use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ - AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DeviceManagerCreateError, - DevicePersistError, DeviceRestoreArgs, + AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError, + DeviceRestoreArgs, }; use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; @@ -48,18 +48,15 @@ use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; use crate::vstate::vcpu::VcpuError; use crate::vstate::vm::{Vm, VmError}; -use crate::{EventManager, Vmm, VmmError, device_manager}; +use crate::{EventManager, Vmm, VmmError}; /// Errors associated with starting the instance. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum StartMicrovmError { /// Unable to attach block device to Vmm: {0} AttachBlockDevice(io::Error), - /// Unable to attach the VMGenID device: {0} - AttachVmgenidDevice(#[from] AttachVmgenidError), - #[cfg(target_arch = "aarch64")] - /// Unable to attach legacy MMIO devices: {0} - AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), + /// Could not attach device: {0} + AttachDevice(#[from] AttachDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), /// Failed to create device manager: {0} @@ -104,8 +101,6 @@ pub enum StartMicrovmError { NetDeviceNotConfigured, /// Cannot open the block device backing file: {0} OpenBlockDevice(io::Error), - /// Cannot initialize a MMIO Device or add a device to the MMIO Bus or cmdline: {0} - RegisterMmioDevice(#[from] device_manager::AttachMmioDeviceError), /// Cannot restore microvm state: {0} RestoreMicrovmState(MicrovmStateError), /// Cannot set vm resources: {0} @@ -563,7 +558,7 @@ fn attach_entropy_device( cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = entropy_device .lock() .expect("Poisoned lock") @@ -625,7 +620,7 @@ fn attach_unixsock_vsock_device( cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
@@ -638,7 +633,7 @@ fn attach_balloon_device( cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index a60a86ea7c3..8df4da2863d 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -64,34 +64,21 @@ pub enum DeviceManagerCreateError { #[derive(Debug, thiserror::Error, displaydoc::Display)] /// Error while attaching a VirtIO device -pub enum AttachMmioDeviceError { +pub enum AttachDeviceError { /// MMIO transport error: {0} MmioTransport(#[from] MmioError), /// Error inserting device in bus: {0} Bus(#[from] vm_device::BusError), -} - -#[derive(Debug, thiserror::Error, displaydoc::Display)] -/// Error while attaching the VMGenID device -pub enum AttachVmgenidError { /// Error creating VMGenID device: {0} CreateVmGenID(#[from] VmGenIdError), /// Error while registering VMGenID with KVM: {0} AttachVmGenID(#[from] kvm_ioctls::Error), -} - -#[cfg(target_arch = "aarch64")] -#[derive(Debug, thiserror::Error, displaydoc::Display)] -/// Error while attaching the VMGenID device -pub enum AttachLegacyMmioDeviceError { + #[cfg(target_arch = "aarch64")] /// Cmdline error Cmdline, + #[cfg(target_arch = "aarch64")] /// Error creating serial device: {0} CreateSerial(#[from] std::io::Error), - /// Error registering device: {0} - RegisterMMIODevice(#[from] MmioError), - /// Error inserting device in the Bus: {0} - Bus(#[from] vm_device::BusError), } #[derive(Debug)] @@ -181,7 +168,7 @@ impl DeviceManager { device: Arc>, cmdline: &mut Cmdline, is_vhost_user: bool, - ) -> Result<(), AttachMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { let interrupt = 
Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. let device = @@ -201,7 +188,7 @@ impl DeviceManager { pub(crate) fn attach_boot_timer_device( &mut self, request_ts: TimestampUs, - ) -> Result<(), AttachMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices @@ -214,7 +201,7 @@ impl DeviceManager { &mut self, mem: &GuestMemoryMmap, vm: &Vm, - ) -> Result<(), AttachVmgenidError> { + ) -> Result<(), AttachDeviceError> { let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) @@ -226,13 +213,13 @@ impl DeviceManager { vm: &Vm, event_manager: &mut EventManager, cmdline: &mut Cmdline, - ) -> Result<(), AttachLegacyMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { // Serial device setup. let cmdline_contains_console = cmdline .as_cstring() - .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .map_err(|_| AttachDeviceError::Cmdline)? .into_string() - .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .map_err(|_| AttachDeviceError::Cmdline)? .contains("console="); if cmdline_contains_console { From ba7691c8ece8059cb0ffe36291c6e4246fd4278e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 20:48:16 +0200 Subject: [PATCH 13/27] pci: add virtio-pci transport implementation Add a VirtIO PCI transport implementation. When a Firecracker microVM is launched with --enable-pci, we will create all VirtIO devices using the PCI transport layer. Snapshotting of VirtIO PCI devices is not supported and we will add this functionality in later commit. Add a couple of tests that ensure that PCI configuration space is what expected. We read common fields and make sure the BAR we allocate for the VirtIO device is what expected. 
Signed-off-by: Babis Chalios --- Cargo.lock | 8 + src/vmm/Cargo.toml | 2 + src/vmm/src/builder.rs | 14 +- src/vmm/src/device_manager/mod.rs | 27 +- src/vmm/src/device_manager/pci_mngr.rs | 131 +- src/vmm/src/devices/virtio/device.rs | 2 +- src/vmm/src/devices/virtio/queue.rs | 13 + src/vmm/src/devices/virtio/transport/mod.rs | 2 + .../virtio/transport/pci/common_config.rs | 415 ++++++ .../devices/virtio/transport/pci/device.rs | 1279 +++++++++++++++++ .../src/devices/virtio/transport/pci/mod.rs | 5 + 11 files changed, 1887 insertions(+), 11 deletions(-) create mode 100644 src/vmm/src/devices/virtio/transport/pci/common_config.rs create mode 100644 src/vmm/src/devices/virtio/transport/pci/device.rs create mode 100644 src/vmm/src/devices/virtio/transport/pci/mod.rs diff --git a/Cargo.lock b/Cargo.lock index ade3eee8715..2f2bee5385b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -112,6 +112,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + [[package]] name = "arrayvec" version = "0.7.6" @@ -1657,11 +1663,13 @@ version = "0.1.0" dependencies = [ "acpi_tables", "aes-gcm", + "anyhow", "arrayvec", "aws-lc-rs", "base64", "bincode", "bitflags 2.9.1", + "byteorder", "crc64", "criterion", "derive_more", diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index f71f74db7dd..40d79091abf 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -17,11 +17,13 @@ gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } +anyhow = "1.0.98" arrayvec = { version = "0.7.6", optional = true } aws-lc-rs = { version = "1.13.1", features = ["bindgen"] } base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.9.1" +byteorder = "1.5.0" crc64 = "2.0.0" derive_more = { 
version = "2.0.1", default-features = false, features = [ "from", diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 8bb4dff867b..5a255f5cf7b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -169,6 +169,8 @@ pub fn build_microvm_for_boot( let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm)?; + let vm = Arc::new(vm); + let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; @@ -271,7 +273,7 @@ pub fn build_microvm_for_boot( instance_info: instance_info.clone(), shutdown_exit_code: None, kvm, - vm: Arc::new(vm), + vm, uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, @@ -554,7 +556,7 @@ fn setup_pvtime( fn attach_entropy_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, @@ -571,7 +573,7 @@ fn attach_entropy_device( fn attach_block_devices<'a, I: Iterator>> + Debug>( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, @@ -600,7 +602,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( fn attach_net_devices<'a, I: Iterator>> + Debug>( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, @@ -616,7 +618,7 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( fn attach_unixsock_vsock_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, @@ -629,7 +631,7 @@ fn attach_unixsock_vsock_device( fn attach_balloon_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, diff --git a/src/vmm/src/device_manager/mod.rs 
b/src/vmm/src/device_manager/mod.rs index 8df4da2863d..da61db922c3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -79,6 +79,8 @@ pub enum AttachDeviceError { #[cfg(target_arch = "aarch64")] /// Error creating serial device: {0} CreateSerial(#[from] std::io::Error), + /// Error attach PCI device: {0} + PciTransport(#[from] PciManagerError), } #[derive(Debug)] @@ -160,8 +162,10 @@ impl DeviceManager { }) } - /// Attaches a VirtioDevice device to the device manager and event manager. - pub(crate) fn attach_virtio_device( + /// Attaches an MMIO VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_mmio_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( &mut self, vm: &Vm, id: String, @@ -184,6 +188,25 @@ impl DeviceManager { Ok(()) } + /// Attaches a VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_virtio_device( + &mut self, + vm: &Arc, + id: String, + device: Arc>, + cmdline: &mut Cmdline, + is_vhost_user: bool, + ) -> Result<(), AttachDeviceError> { + if self.pci_devices.pci_segment.is_some() { + self.pci_devices + .attach_pci_virtio_device(vm, &self.resource_allocator, id, device)?; + } else { + self.attach_mmio_virtio_device(vm, id, device, cmdline, is_vhost_user)?; + } + + Ok(()) + } + /// Attaches a [`BootTimer`] to the VM pub(crate) fn attach_boot_timer_device( &mut self, diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index e9ada60cc1f..686349858fb 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -1,18 +1,29 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -use std::sync::Arc; +use std::collections::HashMap; +use std::fmt::Debug; +use std::sync::{Arc, Mutex}; +use event_manager::MutEventSubscriber; +use log::debug; +use pci::{PciBarRegionType, PciDevice, PciDeviceError, PciRootError}; use serde::{Deserialize, Serialize}; use vm_device::BusError; -use super::resources::ResourceAllocator; +use crate::Vm; +use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::vstate::vm::InterruptError; #[derive(Debug, Default)] pub struct PciDevices { /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. pub pci_segment: Option, + /// All VirtIO PCI devices of the system + pub virtio_devices: HashMap<(u32, String), Arc>>, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -21,6 +32,16 @@ pub enum PciManagerError { ResourceAllocation(#[from] vm_allocator::Error), /// Bus error: {0} Bus(#[from] BusError), + /// PCI root error: {0} + PciRoot(#[from] PciRootError), + /// MSI error: {0} + Msi(#[from] InterruptError), + /// VirtIO PCI device error: {0} + VirtioPciDevice(#[from] VirtioPciDeviceError), + /// PCI device error: {0} + PciDeviceError(#[from] PciDeviceError), + /// KVM error: {0} + Kvm(#[from] vmm_sys_util::errno::Error), } impl PciDevices { @@ -61,6 +82,112 @@ impl PciDevices { Ok(()) } + + fn register_bars_with_bus( + resource_allocator: &ResourceAllocator, + virtio_device: &Arc>, + ) -> Result<(), PciManagerError> { + for bar in &virtio_device.lock().expect("Poisoned lock").bar_regions { + match bar.region_type() { + PciBarRegionType::IoRegion => { + debug!( + "Inserting I/O BAR region: {:#x}:{:#x}", + bar.addr(), + bar.size() + ); + #[cfg(target_arch = "x86_64")] + resource_allocator.pio_bus.insert( + virtio_device.clone(), + bar.addr(), + 
bar.size(), + )?; + #[cfg(target_arch = "aarch64")] + log::error!("pci: We do not support I/O region allocation") + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + debug!( + "Inserting MMIO BAR region: {:#x}:{:#x}", + bar.addr(), + bar.size() + ); + resource_allocator.mmio_bus.insert( + virtio_device.clone(), + bar.addr(), + bar.size(), + )?; + } + } + } + + Ok(()) + } + + pub(crate) fn attach_pci_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( + &mut self, + vm: &Arc, + resource_allocator: &ResourceAllocator, + id: String, + device: Arc>, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let pci_device_bdf = pci_segment.next_device_bdf()?; + debug!("Allocating BDF: {pci_device_bdf:?} for device"); + let mem = vm.guest_memory().clone(); + + // Allocate one MSI vector per queue, plus one for configuration + let msix_num = + u16::try_from(device.lock().expect("Poisoned lock").queues().len() + 1).unwrap(); + + let msix_vectors = Arc::new(Vm::create_msix_group( + vm.clone(), + resource_allocator, + msix_num, + )?); + + // Create the transport + let mut virtio_device = + VirtioPciDevice::new(id.clone(), mem, device, msix_vectors, pci_device_bdf.into())?; + + // Allocate bars + let mut mmio32_allocator = resource_allocator + .mmio32_memory + .lock() + .expect("Poisoned lock"); + let mut mmio64_allocator = resource_allocator + .mmio64_memory + .lock() + .expect("Poisoned lock"); + + virtio_device.allocate_bars(&mut mmio32_allocator, &mut mmio64_allocator, None)?; + + let virtio_device = Arc::new(Mutex::new(virtio_device)); + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device(pci_device_bdf.device() as u32, virtio_device.clone())?; + + Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + virtio_device + .lock() + .expect("Poisoned lock") + 
.register_notification_ioevent(vm)?; + + Ok(()) + } + + /// Gets the specified device. + pub fn get_virtio_device( + &self, + device_type: u32, + device_id: &str, + ) -> Option<&Arc>> { + self.virtio_devices + .get(&(device_type, device_id.to_string())) + } } #[derive(Default, Debug, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 083cd1bb54f..49ac1802447 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -148,7 +148,7 @@ pub trait VirtioDevice: AsAny + Send { /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. - fn reset(&mut self) -> Option<(EventFd, Vec)> { + fn reset(&mut self) -> Option<(Arc, Vec)> { None } diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index 53174c0891b..84cb60dd59e 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -712,6 +712,19 @@ impl Queue { new - used_event - Wrapping(1) < new - old } + + /// Resets the Virtio Queue + pub(crate) fn reset(&mut self) { + self.ready = false; + self.size = self.max_size; + self.desc_table_address = GuestAddress(0); + self.avail_ring_address = GuestAddress(0); + self.used_ring_address = GuestAddress(0); + self.next_avail = Wrapping(0); + self.next_used = Wrapping(0); + self.num_added = Wrapping(0); + self.uses_notif_suppression = false; + } } #[cfg(kani)] diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs index d41ad943aa2..c16a7adbe9d 100644 --- a/src/vmm/src/devices/virtio/transport/mod.rs +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -8,6 +8,8 @@ use vmm_sys_util::eventfd::EventFd; /// MMIO transport for VirtIO devices pub mod mmio; +/// PCI transport for VirtIO devices +pub mod pci; /// Represents the types of interrupts used by VirtIO devices #[derive(Debug, Clone)] diff 
--git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs new file mode 100644 index 00000000000..c8ee2d1d2a9 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -0,0 +1,415 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; + +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::logger::{debug, error, info, trace, warn}; +pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioPciCommonConfigState { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: u16, + pub msix_queues: Vec, +} + +// The standard layout for the ring is a continuous chunk of memory which looks +// like this. We assume num is a power of 2. +// +// struct vring +// { +// // The actual descriptors (16 bytes each) +// struct vring_desc desc[num]; +// +// // A ring of available descriptor heads with free-running index. +// __virtio16 avail_flags; +// __virtio16 avail_idx; +// __virtio16 available[num]; +// __virtio16 used_event_idx; +// +// // Padding to the next align boundary. +// char pad[]; +// +// // A ring of used descriptor heads with free-running index. 
+// __virtio16 used_flags; +// __virtio16 used_idx; +// struct vring_used_elem used[num]; +// __virtio16 avail_event_idx; +// }; +// struct vring_desc { +// __virtio64 addr; +// __virtio32 len; +// __virtio16 flags; +// __virtio16 next; +// }; +// +// struct vring_avail { +// __virtio16 flags; +// __virtio16 idx; +// __virtio16 ring[]; +// }; +// +// // u32 is used here for ids for padding reasons. +// struct vring_used_elem { +// // Index of start of used descriptor chain. +// __virtio32 id; +// // Total length of the descriptor chain which was used (written to) +// __virtio32 len; +// }; +// +// Kernel header used for this reference: include/uapi/linux/virtio_ring.h +// Virtio Spec: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html +// +const VRING_DESC_ELEMENT_SIZE: usize = 16; +const VRING_AVAIL_ELEMENT_SIZE: usize = 2; +const VRING_USED_ELEMENT_SIZE: usize = 8; +#[derive(Debug)] +pub enum VringType { + Desc, + Avail, + Used, +} + +pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { + let (length_except_ring, element_size) = match t { + VringType::Desc => (0, VRING_DESC_ELEMENT_SIZE), + VringType::Avail => (6, VRING_AVAIL_ELEMENT_SIZE), + VringType::Used => (6, VRING_USED_ELEMENT_SIZE), + }; + (length_except_ring + element_size * queue_size as usize) as u64 +} + +/// Contains the data for reading and writing the common configuration structure of a virtio PCI +/// device. +/// +/// * Registers: +/// +/// ** About the whole device. +/// le32 device_feature_select; // 0x00 // read-write +/// le32 device_feature; // 0x04 // read-only for driver +/// le32 driver_feature_select; // 0x08 // read-write +/// le32 driver_feature; // 0x0C // read-write +/// le16 msix_config; // 0x10 // read-write +/// le16 num_queues; // 0x12 // read-only for driver +/// u8 device_status; // 0x14 // read-write (driver_status) +/// u8 config_generation; // 0x15 // read-only for driver +/// +/// ** About a specific virtqueue. 
+/// le16 queue_select; // 0x16 // read-write +/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. +/// le16 queue_msix_vector; // 0x1A // read-write +/// le16 queue_enable; // 0x1C // read-write (Ready) +/// le16 queue_notify_off; // 0x1E // read-only for driver +/// le64 queue_desc; // 0x20 // read-write +/// le64 queue_avail; // 0x28 // read-write +/// le64 queue_used; // 0x30 // read-write +#[derive(Debug)] +pub struct VirtioPciCommonConfig { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: Arc, + pub msix_queues: Arc>>, +} + +impl VirtioPciCommonConfig { + pub fn new(state: VirtioPciCommonConfigState) -> Self { + VirtioPciCommonConfig { + driver_status: state.driver_status, + config_generation: state.config_generation, + device_feature_select: state.device_feature_select, + driver_feature_select: state.driver_feature_select, + queue_select: state.queue_select, + msix_config: Arc::new(AtomicU16::new(state.msix_config)), + msix_queues: Arc::new(Mutex::new(state.msix_queues)), + } + } + + fn state(&self) -> VirtioPciCommonConfigState { + VirtioPciCommonConfigState { + driver_status: self.driver_status, + config_generation: self.config_generation, + device_feature_select: self.device_feature_select, + driver_feature_select: self.driver_feature_select, + queue_select: self.queue_select, + msix_config: self.msix_config.load(Ordering::Acquire), + msix_queues: self.msix_queues.lock().unwrap().clone(), + } + } + + pub fn read(&mut self, offset: u64, data: &mut [u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => { + let v = self.read_common_config_byte(offset); + data[0] = v; + } + 2 => { + let v = self.read_common_config_word(offset, device.lock().unwrap().queues()); + LittleEndian::write_u16(data, v); + } + 4 => { + let v = self.read_common_config_dword(offset, device); + LittleEndian::write_u32(data, v); + } + 8 => 
{ + let v = self.read_common_config_qword(offset); + LittleEndian::write_u64(data, v); + } + _ => error!("invalid data length for virtio read: len {}", data.len()), + } + } + + pub fn write(&mut self, offset: u64, data: &[u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => self.write_common_config_byte(offset, data[0]), + 2 => self.write_common_config_word( + offset, + LittleEndian::read_u16(data), + device.lock().unwrap().queues_mut(), + ), + 4 => self.write_common_config_dword(offset, LittleEndian::read_u32(data), device), + 8 => self.write_common_config_qword( + offset, + LittleEndian::read_u64(data), + device.lock().unwrap().queues_mut(), + ), + _ => error!("invalid data length for virtio write: len {}", data.len()), + } + } + + fn read_common_config_byte(&self, offset: u64) -> u8 { + debug!("read_common_config_byte: offset 0x{:x}", offset); + // The driver is only allowed to do aligned, properly sized access. + match offset { + 0x14 => self.driver_status, + 0x15 => self.config_generation, + _ => { + warn!("invalid virtio config byte read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_byte(&mut self, offset: u64, value: u8) { + debug!("write_common_config_byte: offset 0x{offset:x}: {value:x}"); + match offset { + 0x14 => self.driver_status = value, + _ => { + warn!("invalid virtio config byte write: 0x{:x}", offset); + } + } + } + + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { + debug!("read_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.load(Ordering::Acquire), + 0x12 => queues.len().try_into().unwrap(), // num_queues + 0x16 => self.queue_select, + 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + 0x1e => self.queue_select, // notify_off + _ => { + warn!("invalid virtio register word read: 
0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { + debug!("write_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.store(value, Ordering::Release), + 0x16 => self.queue_select = value, + 0x18 => self.with_queue_mut(queues, |q| q.size = value), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1c => self.with_queue_mut(queues, |q| { + q.ready = value == 1; + }), + _ => { + warn!("invalid virtio register word write: 0x{:x}", offset); + } + } + } + + fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { + debug!("read_common_config_dword: offset 0x{:x}", offset); + match offset { + 0x00 => self.device_feature_select, + 0x04 => { + let locked_device = device.lock().unwrap(); + // Only 64 bits of features (2 pages) are defined for now, so limit + // device_feature_select to avoid shifting by 64 or more bits. + if self.device_feature_select < 2 { + ((locked_device.avail_features() >> (self.device_feature_select * 32)) + & 0xffff_ffff) as u32 + } else { + 0 + } + } + 0x08 => self.driver_feature_select, + _ => { + warn!("invalid virtio register dword read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_dword( + &mut self, + offset: u64, + value: u32, + device: Arc>, + ) { + debug!("write_common_config_dword: offset 0x{:x}", offset); + fn hi(v: &mut GuestAddress, x: u32) { + *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) + } + + fn lo(v: &mut GuestAddress, x: u32) { + *v = (*v & !0xffff_ffff) | u64::from(x) + } + + let mut locked_device = device.lock().unwrap(); + + match offset { + 0x00 => self.device_feature_select = value, + 0x08 => self.driver_feature_select = value, + 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), + 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.desc_table_address, value) + }), + 0x24 => 
self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.desc_table_address, value) + }), + 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.avail_ring_address, value) + }), + 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.avail_ring_address, value) + }), + 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.used_ring_address, value) + }), + 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.used_ring_address, value) + }), + _ => { + warn!("invalid virtio register dword write: 0x{:x}", offset); + } + } + } + + fn read_common_config_qword(&self, _offset: u64) -> u64 { + debug!("read_common_config_qword: offset 0x{:x}", _offset); + 0 // Assume the guest has no reason to read write-only registers. + } + + fn write_common_config_qword(&mut self, offset: u64, value: u64, queues: &mut [Queue]) { + debug!("write_common_config_qword: offset 0x{:x}", offset); + + let low = Some((value & 0xffff_ffff) as u32); + let high = Some((value >> 32) as u32); + + match offset { + 0x20 => self.with_queue_mut(queues, |q| q.desc_table_address.0 = value), + 0x28 => self.with_queue_mut(queues, |q| q.avail_ring_address.0 = value), + 0x30 => self.with_queue_mut(queues, |q| q.used_ring_address.0 = value), + _ => { + warn!("invalid virtio register qword write: 0x{:x}", offset); + } + } + } + + fn with_queue(&self, queues: &[Queue], f: F) -> Option + where + F: FnOnce(&Queue) -> U, + { + queues.get(self.queue_select as usize).map(f) + } + + fn with_queue_mut(&self, queues: &mut [Queue], f: F) { + if let Some(queue) = queues.get_mut(self.queue_select as usize) { + f(queue); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; + + #[test] + fn write_base_regs() { + let mut regs = VirtioPciCommonConfig { + driver_status: 0xaa, + config_generation: 0x55, + device_feature_select: 0x0, + driver_feature_select: 0x0, + 
queue_select: 0xff, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 3])), + }; + + let dev = Arc::new(Mutex::new(DummyDevice::new())); + // Can set all bits of driver_status. + regs.write(0x14, &[0x55], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x14, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // The config generation register is read only. + regs.write(0x15, &[0xaa], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x15, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // Device features is read-only and passed through from the device. + regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x04, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0u32); + + // Feature select registers are read/write. + regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x00, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x08, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + + // 'queue_select' can be read and written. + regs.write(0x16, &[0xaa, 0x55], dev.clone()); + let mut read_back = vec![0x00, 0x00]; + regs.read(0x16, &mut read_back, dev); + assert_eq!(read_back[0], 0xaa); + assert_eq!(read_back[1], 0x55); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs new file mode 100644 index 00000000000..20c169297fd --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -0,0 +1,1279 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::cmp; +use std::fmt::{Debug, Formatter}; +use std::io::Write; +use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::anyhow; +use kvm_ioctls::{IoEventAddress, NoDatamatch}; +use pci::{ + BarReprogrammingParams, MsixCap, MsixConfig, MsixConfigState, PciBarConfiguration, + PciBarRegionType, PciBdf, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, + PciConfigurationState, PciDevice, PciDeviceError, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciSubclass, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_allocator::{AddressAllocator, AllocPolicy, RangeInclusive}; +use vm_device::interrupt::{InterruptIndex, InterruptSourceGroup, MsiIrqGroupConfig}; +use vm_device::{BusDevice, PciBarType, Resource}; +use vm_memory::{Address, ByteValued, GuestAddress, Le32}; +use vmm_sys_util::errno; +use vmm_sys_util::eventfd::EventFd; + +use crate::Vm; +use crate::device_manager::resources::ResourceAllocator; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::common_config::{ + VirtioPciCommonConfig, VirtioPciCommonConfigState, +}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; +use crate::devices::virtio::{TYPE_BLOCK, TYPE_NET}; +use crate::logger::{debug, error}; +use crate::utils::u64_to_usize; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; + +const DEVICE_INIT: u8 = 0x00; +const DEVICE_ACKNOWLEDGE: u8 = 0x01; +const DEVICE_DRIVER: u8 = 0x02; +const DEVICE_DRIVER_OK: u8 = 0x04; +const DEVICE_FEATURES_OK: u8 = 0x08; 
+const DEVICE_FAILED: u8 = 0x80; + +const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; +const VIRTIO_F_RING_EVENT_IDX: u32 = 29; +const VIRTIO_F_VERSION_1: u32 = 32; +const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_IN_ORDER: u32 = 35; +const VIRTIO_F_ORDER_PLATFORM: u32 = 36; +#[allow(dead_code)] +const VIRTIO_F_SR_IOV: u32 = 37; +const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; + +/// Vector value used to disable MSI for a queue. +const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; + +enum PciCapabilityType { + Common = 1, + Notify = 2, + Isr = 3, + Device = 4, + Pci = 5, + SharedMemory = 8, +} + +// This offset represents the 2 bytes omitted from the VirtioPciCap structure +// as they are already handled through add_capability(). These 2 bytes are the +// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec. +const VIRTIO_PCI_CAP_OFFSET: usize = 2; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCap { + cap_len: u8, // Generic PCI field: capability length + cfg_type: u8, // Identifies the structure. + pci_bar: u8, // Where to find it. + id: u8, // Multiple capabilities of the same type + padding: [u8; 2], // Pad to full dword. + offset: Le32, // Offset within bar. + length: Le32, // Length of the structure, in bytes. +} + +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCap {} + +impl PciCapability for VirtioPciCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2; + +impl VirtioPciCap { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self { + VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciNotifyCap { + cap: VirtioPciCap, + notify_off_multiplier: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciNotifyCap {} + +impl PciCapability for VirtioPciNotifyCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciNotifyCap { + pub fn new( + cfg_type: PciCapabilityType, + pci_bar: u8, + offset: u32, + length: u32, + multiplier: Le32, + ) -> Self { + VirtioPciNotifyCap { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + }, + notify_off_multiplier: multiplier, + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap64 { + cap: VirtioPciCap, + offset_hi: Le32, + length_hi: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCap64 {} + +impl PciCapability for VirtioPciCap64 { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCap64 { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self { + VirtioPciCap64 { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id, + padding: [0; 2], + offset: Le32::from((offset & 0xffff_ffff) as u32), + length: Le32::from((length & 0xffff_ffff) as u32), + }, + offset_hi: Le32::from((offset >> 32) as u32), + length_hi: Le32::from((length >> 32) as u32), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCap { + cap: VirtioPciCap, + pci_cfg_data: [u8; 4], +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCfgCap {} + +impl PciCapability for VirtioPciCfgCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCfgCap { + fn new() -> Self { + VirtioPciCfgCap { + cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0, 0), + ..Default::default() + } + } +} + +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCapInfo { + offset: usize, + cap: VirtioPciCfgCap, +} + +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum PciVirtioSubclass { + NonTransitionalBase = 0xff, +} + +impl PciSubclass for PciVirtioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +// Allocate one bar for the structs pointed to by the capability structures. +// As per the PCI specification, because the same BAR shares MSI-X and non +// MSI-X structures, it is recommended to use 8KiB alignment for all those +// structures. 
+const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; +const COMMON_CONFIG_SIZE: u64 = 56; +const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_SIZE: u64 = 1; +const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_SIZE: u64 = 0x1000; +const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; +const NOTIFICATION_SIZE: u64 = 0x1000; +const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +// The size is 256KiB because the table can hold up to 2048 entries, with each +// entry being 128 bits (4 DWORDS). +const MSIX_TABLE_SIZE: u64 = 0x40000; +const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +// The size is 2KiB because the Pending Bit Array has one bit per vector and it +// can support up to 2048 vectors. +const MSIX_PBA_SIZE: u64 = 0x800; +// The BAR size must be a power of 2. +const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const VIRTIO_COMMON_BAR_INDEX: usize = 0; +const VIRTIO_SHM_BAR_INDEX: usize = 2; + +const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. + +const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; +const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. + +#[derive(Debug, Serialize, Deserialize)] +struct QueueState { + max_size: u16, + size: u16, + ready: bool, + desc_table: u64, + avail_ring: u64, + used_ring: u64, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct VirtioPciDeviceState { + pub pci_device_bdf: PciBdf, + device_activated: bool, + queues: Vec, + interrupt_status: usize, + cap_pci_cfg_offset: usize, + cap_pci_cfg: Vec, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum VirtioPciDeviceError { + /// Failed creating VirtioPciDevice: {0} + CreateVirtioPciDevice(#[from] anyhow::Error), + /// Error creating MSI configuration: {0} + Msi(#[from] pci::MsixError), +} +pub type Result = std::result::Result; + +pub struct VirtioPciDevice { + id: String, + + // BDF assigned to the device + pci_device_bdf: PciBdf, + + // PCI configuration registers. 
+ configuration: PciConfiguration, + + // virtio PCI common configuration + common_config: VirtioPciCommonConfig, + + // MSI-X config + msix_config: Option>>, + + // Number of MSI-X vectors + msix_num: u16, + + // Virtio device reference and status + device: Arc>, + device_activated: Arc, + + // PCI interrupts. + interrupt_status: Arc, + virtio_interrupt: Option>, + interrupt_source_group: Arc, + + // Guest memory + memory: GuestMemoryMmap, + + // Settings PCI BAR + settings_bar: u8, + + // Whether to use 64-bit bar location or 32-bit + use_64bit_bar: bool, + + // Add a dedicated structure to hold information about the very specific + // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support + // the legacy/backward compatible mechanism of letting the guest access the + // other virtio capabilities without mapping the PCI BARs. This can be + // needed when the guest tries to early access the virtio configuration of + // a device. + cap_pci_cfg_info: VirtioPciCfgCapInfo, + + // Details of bar regions to free + pub bar_regions: Vec, +} + +impl Debug for VirtioPciDevice { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("VirtioPciDevice") + .field("id", &self.id) + .finish() + } +} + +impl VirtioPciDevice { + fn pci_configuration( + virtio_device_type: u32, + msix_config: &Arc>, + pci_config_state: Option, + ) -> PciConfiguration { + let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + u16::try_from(virtio_device_type).unwrap(); + let (class, subclass) = match virtio_device_type { + TYPE_NET => ( + PciClassCode::NetworkController, + &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass, + ), + TYPE_BLOCK => ( + PciClassCode::MassStorage, + &PciMassStorageSubclass::MassStorage as &dyn PciSubclass, + ), + _ => ( + PciClassCode::Other, + &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass, + ), + }; + + PciConfiguration::new( + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + 0x1, // For modern virtio-PCI devices + 
class, + subclass, + None, + PciHeaderType::Device, + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + Some(msix_config.clone()), + pci_config_state, + ) + } + + fn msix_config( + pci_device_bdf: u32, + msix_vectors: Arc, + msix_config_state: Option, + ) -> Result>> { + let msix_config = Arc::new(Mutex::new(MsixConfig::new( + msix_vectors.num_vectors(), + msix_vectors, + pci_device_bdf, + msix_config_state, + )?)); + + Ok(msix_config) + } + + /// Constructs a new PCI transport for the given virtio device. + #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + pci_device_bdf: u32, + ) -> Result { + let num_queues = device.lock().expect("Poisoned lock").queues().len(); + + let msix_config = Self::msix_config(pci_device_bdf, msi_vectors.clone(), None)?; + let pci_config = Self::pci_configuration( + device.lock().expect("Poisoned lock").device_type(), + &msix_config, + None, + ); + + let virtio_common_config = VirtioPciCommonConfig::new(VirtioPciCommonConfigState { + driver_status: 0, + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + msix_config: VIRTQ_MSI_NO_VECTOR, + msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], + }); + let interrupt = Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_common_config.msix_config.clone(), + virtio_common_config.msix_queues.clone(), + msi_vectors.clone(), + )); + + let virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: pci_device_bdf.into(), + configuration: pci_config, + common_config: virtio_common_config, + msix_config: Some(msix_config), + msix_num: msi_vectors.num_vectors(), + device, + device_activated: Arc::new(AtomicBool::new(false)), + interrupt_status: Arc::new(AtomicUsize::new(0)), + virtio_interrupt: None, + memory, + settings_bar: 0, + use_64bit_bar: true, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info: VirtioPciCfgCapInfo::default(), + bar_regions: vec![], + }; 
+ + Ok(virtio_pci_device) + } + + fn is_driver_ready(&self) -> bool { + let ready_bits = + (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); + self.common_config.driver_status == ready_bits + && self.common_config.driver_status & DEVICE_FAILED == 0 + } + + /// Determines if the driver has requested the device (re)init / reset itself + fn is_driver_init(&self) -> bool { + self.common_config.driver_status == DEVICE_INIT + } + + pub fn config_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(self.settings_bar as usize) + } + + fn add_pci_capabilities( + &mut self, + settings_bar: u8, + ) -> std::result::Result<(), PciDeviceError> { + // Add pointers to the different configuration structures from the PCI capabilities. + let common_cap = VirtioPciCap::new( + PciCapabilityType::Common, + settings_bar, + COMMON_CONFIG_BAR_OFFSET.try_into().unwrap(), + COMMON_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&common_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let isr_cap = VirtioPciCap::new( + PciCapabilityType::Isr, + settings_bar, + ISR_CONFIG_BAR_OFFSET.try_into().unwrap(), + ISR_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&isr_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + // TODO(dgreid) - set based on device's configuration size? 
+ let device_cap = VirtioPciCap::new( + PciCapabilityType::Device, + settings_bar, + DEVICE_CONFIG_BAR_OFFSET.try_into().unwrap(), + DEVICE_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&device_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let notify_cap = VirtioPciNotifyCap::new( + PciCapabilityType::Notify, + settings_bar, + NOTIFICATION_BAR_OFFSET.try_into().unwrap(), + NOTIFICATION_SIZE.try_into().unwrap(), + Le32::from(NOTIFY_OFF_MULTIPLIER), + ); + self.configuration + .add_capability(¬ify_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let configuration_cap = VirtioPciCfgCap::new(); + self.cap_pci_cfg_info.offset = self + .configuration + .add_capability(&configuration_cap) + .map_err(PciDeviceError::CapabilitiesSetup)? + + VIRTIO_PCI_CAP_OFFSET; + self.cap_pci_cfg_info.cap = configuration_cap; + + if self.msix_config.is_some() { + let msix_cap = MsixCap::new( + settings_bar, + self.msix_num, + MSIX_TABLE_BAR_OFFSET.try_into().unwrap(), + settings_bar, + MSIX_PBA_BAR_OFFSET.try_into().unwrap(), + ); + self.configuration + .add_capability(&msix_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + } + + self.settings_bar = settings_bar; + Ok(()) + } + + fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) { + let cap_slice = self.cap_pci_cfg_info.cap.as_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to read cap_pci_cfg from config space"); + return; + } + + if offset < std::mem::size_of::() { + if let Some(end) = offset.checked_add(data_len) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&cap_slice[offset..cmp::min(end, cap_len)]) + .unwrap(); + } + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. 
+ unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.read_bar(0, bar_offset as u64, data) + } + } + + fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option> { + let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to write cap_pci_cfg to config space"); + return None; + } + + if offset < std::mem::size_of::() { + let (_, right) = cap_slice.split_at_mut(offset); + right[..data_len].copy_from_slice(data); + None + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.write_bar(0, bar_offset as u64, data) + } + } + + pub fn virtio_device(&self) -> Arc> { + self.device.clone() + } + + fn needs_activation(&self) -> bool { + !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() + } + + /// Register the IoEvent notification for a VirtIO device + pub fn register_notification_ioevent(&self, vm: &Vm) -> std::result::Result<(), errno::Error> { + let bar_addr = self.config_bar_addr(); + for (i, queue_evt) in self + .device + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER as u64); + vm.fd().register_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + Ok(()) + } + + /// Unregister the IoEvent notification for a VirtIO device + pub fn unregister_notification_ioevent( + &self, + vm: &Vm, + ) -> std::result::Result<(), errno::Error> { + let bar_addr = self.config_bar_addr(); + for (i, queue_evt) in self + .device + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i 
as u64 * NOTIFY_OFF_MULTIPLIER as u64); + vm.fd() + .unregister_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + Ok(()) + } +} + +pub struct VirtioInterruptMsix { + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, +} + +impl std::fmt::Debug for VirtioInterruptMsix { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VirtioInterruptMsix") + .field("msix_config", &self.msix_config) + .field("config_vector", &self.config_vector) + .field("queues_vectors", &self.queues_vectors) + .finish() + } +} + +impl VirtioInterruptMsix { + pub fn new( + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, + ) -> Self { + VirtioInterruptMsix { + msix_config, + config_vector, + queues_vectors, + interrupt_source_group, + } + } +} + +impl VirtioInterrupt for VirtioInterruptMsix { + fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + if vector == VIRTQ_MSI_NO_VECTOR { + return Ok(()); + } + + let config = &mut self.msix_config.lock().unwrap(); + let entry = &config.table_entries[vector as usize]; + // In case the vector control register associated with the entry + // has its first bit set, this means the vector is masked and the + // device should not inject the interrupt. + // Instead, the Pending Bit Array table is updated to reflect there + // is a pending interrupt for this specific vector. 
+ if config.masked() || entry.masked() { + config.set_pba_bit(vector, false); + return Ok(()); + } + + self.interrupt_source_group + .trigger(vector as InterruptIndex) + } + + fn notifier(&self, int_type: VirtioInterruptType) -> Option<&EventFd> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + self.interrupt_source_group + .notifier(vector as InterruptIndex) + } + + fn status(&self) -> Arc { + Arc::new(AtomicU32::new(0)) + } + + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool { + false + } +} + +impl PciDevice for VirtioPciDevice { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base + u64_to_usize(offset) >= self.cap_pci_cfg_info.offset + && base + u64_to_usize(offset) + data.len() + <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base + u64_to_usize(offset) - self.cap_pci_cfg_info.offset; + self.write_cap_pci_cfg(offset, data) + } else { + self.configuration + .write_config_register(reg_idx, offset, data); + None + } + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. 
+ let base = reg_idx * 4; + if base >= self.cap_pci_cfg_info.offset + && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base - self.cap_pci_cfg_info.offset; + let mut data = [0u8; 4]; + self.read_cap_pci_cfg(offset, &mut data); + u32::from_le_bytes(data) + } else { + self.configuration.read_reg(reg_idx) + } + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.configuration.detect_bar_reprogramming(reg_idx, data) + } + + fn allocate_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> std::result::Result, PciDeviceError> { + let mut bars = Vec::new(); + let device_clone = self.device.clone(); + let device = device_clone.lock().unwrap(); + + let mut settings_bar_addr = None; + let mut use_64bit_bar = self.use_64bit_bar; + let restoring = resources.is_some(); + if let Some(resources) = resources { + for resource in resources { + if let Resource::PciBar { + index, base, type_, .. + } = resource + { + if index == VIRTIO_COMMON_BAR_INDEX { + settings_bar_addr = Some(GuestAddress(base)); + use_64bit_bar = match type_ { + PciBarType::Io => { + return Err(PciDeviceError::InvalidResource(resource)); + } + PciBarType::Mmio32 => false, + PciBarType::Mmio64 => true, + }; + break; + } + } + } + // Error out if no resource was matching the BAR id. + if settings_bar_addr.is_none() { + return Err(PciDeviceError::MissingResource); + } + } + + // Allocate the virtio-pci capability BAR. 
+ // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 + let policy = match settings_bar_addr { + Some(addr) => AllocPolicy::ExactMatch(addr.0), + None => AllocPolicy::FirstMatch, + }; + let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let region_type = PciBarRegionType::Memory64BitRegion; + let addr = mmio64_allocator + .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .unwrap() + .start(); + (addr, region_type) + } else { + let region_type = PciBarRegionType::Memory32BitRegion; + let addr = mmio32_allocator + .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .unwrap() + .start(); + (addr, region_type) + }; + + let bar = PciBarConfiguration::default() + .set_index(VIRTIO_COMMON_BAR_INDEX) + .set_address(virtio_pci_bar_addr) + .set_size(CAPABILITY_BAR_SIZE) + .set_region_type(region_type); + + // The creation of the PCI BAR and its associated capabilities must + // happen only during the creation of a brand new VM. When a VM is + // restored from a known state, the BARs are already created with the + // right content, therefore we don't need to go through this codepath. + if !restoring { + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; + + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. + self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; + } + + bars.push(bar); + + self.bar_regions.clone_from(&bars); + + Ok(bars) + } + + fn free_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> std::result::Result<(), PciDeviceError> { + for bar in self.bar_regions.drain(..) 
{ + let range = RangeInclusive::new(bar.addr(), bar.addr() + bar.size()).unwrap(); + match bar.region_type() { + PciBarRegionType::Memory32BitRegion => { + mmio32_allocator.free(&range); + } + PciBarRegionType::Memory64BitRegion => { + mmio64_allocator.free(&range); + } + _ => error!("Unexpected PCI bar type"), + } + } + Ok(()) + } + + fn move_bar( + &mut self, + old_base: u64, + new_base: u64, + ) -> std::result::Result<(), std::io::Error> { + // We only update our idea of the bar in order to support free_bars() above. + // The majority of the reallocation is done inside DeviceManager. + for bar in self.bar_regions.iter_mut() { + if bar.addr() == old_base { + *bar = bar.set_address(new_base); + } + } + + Ok(()) + } + + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .read(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.get_mut(0) { + // Reading this register resets it to 0. + *v = self + .interrupt_status + .swap(0, Ordering::AcqRel) + .try_into() + .unwrap(); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let device = self.device.lock().unwrap(); + device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
+ } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + } + } + + fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .write(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.first() { + self.interrupt_status + .fetch_and(!(*v as usize), Ordering::AcqRel); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let mut device = self.device.lock().unwrap(); + device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
+ error!("Unexpected write to notification BAR: offset = 0x{:x}", o); + } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + }; + + // Try and activate the device if the driver status has changed + if self.needs_activation() { + debug!("Activating device"); + self.virtio_device() + .lock() + .unwrap() + .activate( + self.memory.clone(), + Arc::clone(self.virtio_interrupt.as_ref().unwrap()), + ) + .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + } else { + debug!("Device doesn't need activation"); + } + + // Device has been reset by the driver + if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() { + let mut device = self.device.lock().unwrap(); + let reset_result = device.reset(); + match reset_result { + Some((virtio_interrupt, mut _queue_evts)) => { + // Upon reset the device returns its interrupt EventFD + self.virtio_interrupt = Some(virtio_interrupt); + self.device_activated.store(false, Ordering::SeqCst); + + // Reset queue readiness (changes queue_enable), queue sizes + // and selected_queue as per spec for reset + self.virtio_device() + .lock() + .unwrap() + .queues_mut() + .iter_mut() + .for_each(Queue::reset); + self.common_config.queue_select = 0; + } + None => { + error!("Attempt to reset device when not implemented in underlying device"); + self.common_config.driver_status = DEVICE_FAILED; + } + } + } + + None + } + + fn id(&self) -> Option { + Some(self.id.clone()) + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +impl BusDevice for VirtioPciDevice { + fn read(&mut self, base: u64, 
 offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.write_bar(base, offset, data) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + + use event_manager::MutEventSubscriber; + use linux_loader::loader::Cmdline; + use pci::{PciBdf, PciClassCode, PciDevice, PciSubclass}; + + use super::VirtioPciDevice; + use crate::Vm; + use crate::arch::MEM_64BIT_DEVICES_START; + use crate::builder::tests::default_vmm; + use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::rng::Entropy; + use crate::devices::virtio::transport::pci::device::PciVirtioSubclass; + use crate::rate_limiter::RateLimiter; + + #[test] + fn test_pci_device_config() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // For more information about the values we are checking here look into the VirtIO spec here: + // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1220007 + // and PCI Header type 0 layout here: https://wiki.osdev.org/PCI#Configuration_Space + + // | 16 bits | 16 bits | + // |-----------|-----------| + // register 0x0: | Device ID | Vendor ID | + // + // Vendor ID of VirtIO devices is 0x1af4 + let reg0 = locked_virtio_pci_device.read_config_register(0); + assert_eq!(reg0 & 0xffff, 0x1af4); + // VirtIO PCI device IDs are in the range [0x1000, 0x107f]. (We are not using transitional + // device IDs). 
+ let devid = reg0 >> 16; + assert!( + (0x1000..=0x107f).contains(&devid), + "Device ID check: {:#x} >= 0x1000 && {:#x} <= 0x107f", + devid, + devid + ); + + // | 16 bits | 16 bits | + // |------------|-----------| + // register 0x1: | Status | Command | + // We offer the capabilities list (bit 4 of status register) at offset 0x34 + let reg1 = locked_virtio_pci_device.read_config_register(1); + assert_eq!(reg1, 0x0010_0000); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x2: | Class code | Subclass | Prog IF | Revision ID | + // + // Class code: VIRTIO_PCI_VENDOR_ID for all VirtIO devices + // Subclass: PciClassCode::NetworkController for net, PciClassCode::MassStore for block + // PciClassCode::Other for everything else + // Prog IF: A register defining some programmable interface register. 0 for VirtIO devices + // Revision ID: 0x1 for modern VirtIO devices + let reg2 = locked_virtio_pci_device.read_config_register(2); + assert_eq!(reg2, 0xffff_0001); + let class_code = ((reg2 >> 24) & 0xff) as u8; + assert_eq!(class_code, PciClassCode::Other.get_register_value()); + let subclass = ((reg2 >> 16) & 0xff) as u8; + assert_eq!( + subclass, + PciVirtioSubclass::NonTransitionalBase.get_register_value() + ); + let prog_if = ((reg2 >> 8) & 0xff) as u8; + assert_eq!(prog_if, 0); + let revision_id = reg2 & 0xff; + assert_eq!(revision_id, 0x1); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x3: | BIST | Header Type | Latency timer | Cache line size | + // + // BIST: status and control for self test of PCI devices. 
 Always 0 for VirtIO devices + // HeaderType: 0x0 for general devices + // LatencyTimer: Latency timer in units of PCI bus clocks, 0 for VirtIO + // Cache Line size: 0 for VirtIO devices + let reg3 = locked_virtio_pci_device.read_config_register(3); + assert_eq!(reg3, 0x0); + + // register 0xa: Cardbus CIS pointer + // + // We don't emulate CardBus + let reg10 = locked_virtio_pci_device.read_config_register(0xa); + assert_eq!(reg10, 0); + + // | 16 bits | 16 bits | + // register 0xb: | Subsystem ID | Subsystem vendor ID| + // + // For us Subsystem ID is same as device ID and subsystem vendor ID is same as vendor ID + // (reg 0x0) + let reg11 = locked_virtio_pci_device.read_config_register(0xb); + assert_eq!(reg11, reg0); + + // register 0xc: Expansion ROM base address: 0x0 for us + let reg12 = locked_virtio_pci_device.read_config_register(0xc); + assert_eq!(reg12, 0); + + // | 24 bits | 8 bits | + // register 0xd: | Reserved | Capabilities pointer | + let reg13 = locked_virtio_pci_device.read_config_register(0xd); + assert_eq!(reg13 >> 24, 0); + + // register 0xe: Reserved + let reg14 = locked_virtio_pci_device.read_config_register(0xe); + assert_eq!(reg14, 0); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0xf: | max latency | min grant | Interrupt pin | Interrupt line | + // + // We don't specify any of those + let reg15 = locked_virtio_pci_device.read_config_register(0xf); + assert_eq!(reg15, 0); + } + + #[test] + fn test_reading_bars() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // According
 to OSdev wiki (https://wiki.osdev.org/PCI#Configuration_Space): + // + // When you want to retrieve the actual base address of a BAR, be sure to mask the lower + // bits. For 16-bit Memory Space BARs, you calculate (BAR[x] & 0xFFF0). For 32-bit Memory + // Space BARs, you calculate (BAR[x] & 0xFFFFFFF0). For 64-bit Memory Space BARs, you + // calculate ((BAR[x] & 0xFFFFFFF0) + ((BAR[x + 1] & 0xFFFFFFFF) << 32)) For I/O Space + // BARs, you calculate (BAR[x] & 0xFFFFFFFC). + + // We are allocating a single 64-bit MMIO bar for VirtIO capabilities list. As a result, we + // are using the first two BAR registers from the configuration space. + // + // The BAR address layout is as follows: + // + // | Bits 31-4 | Bit 3 | Bits 2-1 | Bit 0 | + // | 16-Byte Aligned Base Address | Prefetchable | Type | Always 0 | + // + // For 64-bit addresses though a second BAR is used to hold the upper 32 bits + // of the address. Prefetchable and type will be held in the lower bits of the + // first bar along with the lower 32-bits of the address which is always 16-bytes + // aligned. + let bar_addr_lo = locked_virtio_pci_device.read_config_register(0x4); + let bar_addr_hi = locked_virtio_pci_device.read_config_register(0x5); + let bar_addr = bar_addr_lo as u64 + ((bar_addr_hi as u64) << 32); + + // Bit 0 always 0 + assert_eq!(bar_addr & 0x1, 0); + // Type is 0x2 meaning 64-bit BAR + assert_eq!((bar_addr & 0x6) >> 1, 2); + // The actual address of the BAR should be the first available address of our 64-bit MMIO + // region + assert_eq!(bar_addr & 0xffff_ffff_ffff_fff0, MEM_64BIT_DEVICES_START); + + // Reading the BAR size is a bit more convoluted. According to OSDev wiki: + // + // To determine the amount of address space needed by a PCI device, you must save the + // original value of the BAR, write a value of all 1's to the register, then read it back. 
+ // The amount of memory can then be determined by masking the information bits, performing + // a bitwise NOT ('~' in C), and incrementing the value by 1. + + locked_virtio_pci_device.write_config_register(0x4, 0, &[0xff, 0xff, 0xff, 0xff]); + // Read the lower size bits and mask out the last 4 bits include Prefetchable, Type and + // hardwired-0 + let bar_size_lo = locked_virtio_pci_device.read_config_register(0x4) as u64 & 0xfffffff0; + locked_virtio_pci_device.write_config_register(0x5, 0, &[0xff, 0xff, 0xff, 0xff]); + let bar_size_hi = locked_virtio_pci_device.read_config_register(0x5) as u64; + let bar_size = !((bar_size_hi << 32) | bar_size_lo) + 1; + + // We create a capabilities BAR region of 0x80000 bytes + assert_eq!(bar_size, 0x80000); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs new file mode 100644 index 00000000000..520b52274b3 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod common_config; +pub mod device; From e40fab17885004775b61c58dc49836a2105729aa Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 10:29:18 +0200 Subject: [PATCH 14/27] seccomp: allow new ioctls for vCPU threads We are now calling KVM_CHECK_EXTENSION for checking the KVM_CAP_MSI_DEVID capability. We are also calling KVM_SET_GSI_ROUTING to set the interrupts routes and KVM_IRQFD to set/unset interrupt lines. 
Signed-off-by: Babis Chalios --- .../seccomp/aarch64-unknown-linux-musl.json | 43 +++++++++++++++++++ .../seccomp/x86_64-unknown-linux-musl.json | 43 +++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index db3abe1eced..e3aaeaf911b 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -1017,6 +1017,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 95ceca1b7ef..3dcdbf659d1 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -1149,6 +1149,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. 
nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } From 6e3495c0a1864c293a338c606597704b072d2ff5 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 15:07:11 +0200 Subject: [PATCH 15/27] pci: add unit tests to PciSegment Add some unit tests to PciSegment. We now test that the next_device_bdf() method and the initialization logic work as expected. We also check that the configuration space of the PCI segment is correctly registered with the MMIO and, on x86, PIO bus. 
Signed-off-by: Babis Chalios --- src/vmm/src/devices/pci/pci_segment.rs | 97 ++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index 169ffdcba3b..c1e8bb07cb8 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -462,3 +462,100 @@ impl Aml for PciSegment { .append_aml_bytes(v) } } + +#[cfg(test)] +mod tests { + + use super::*; + use crate::arch; + use crate::utils::u64_to_usize; + + #[test] + fn test_pci_segment_build() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + assert_eq!(pci_segment.id, 0); + assert_eq!( + pci_segment.start_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + arch::MEM_32BIT_DEVICES_SIZE - 1 + ); + assert_eq!( + pci_segment.start_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + arch::MEM_64BIT_DEVICES_SIZE - 1 + ); + assert_eq!(pci_segment.mmio_config_address, arch::PCI_MMCONFIG_START); + assert_eq!(pci_segment.proximity_domain, 0); + assert_eq!(pci_segment.pci_devices_up, 0); + assert_eq!(pci_segment.pci_devices_down, 0); + assert_eq!(pci_segment.pci_irq_slots, [0u8; 32]); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_io_bus() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_CONFIG_IO_PORT_SIZE)]; + resource_allocator + .pio_bus + .read(PCI_CONFIG_IO_PORT, &mut data) + .unwrap(); + + resource_allocator + .pio_bus + .read(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE, &mut data) + 
.unwrap_err(); + } + + #[test] + fn test_mmio_bus() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_MMIO_CONFIG_SIZE_PER_SEGMENT)]; + + resource_allocator + .mmio_bus + .read(pci_segment.mmio_config_address, &mut data) + .unwrap(); + resource_allocator + .mmio_bus + .read( + pci_segment.mmio_config_address + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + &mut data, + ) + .unwrap_err(); + } + + #[test] + fn test_next_device_bdf() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + // Start checking from device id 1, since 0 is allocated to the Root port. + for dev_id in 1..32 { + let bdf = pci_segment.next_device_bdf().unwrap(); + // In our case we have a single Segment with id 0, which has + // a single bus with id 0. Also, each device of ours has a + // single function. + assert_eq!(bdf, PciBdf::new(0, 0, dev_id, 0)); + } + + // We can only have 32 devices on a segment + pci_segment.next_device_bdf().unwrap_err(); + } +} From b3feaa40ce37e5a52fd2f8e552c4ba2fb86ce823 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Jun 2025 15:34:45 +0200 Subject: [PATCH 16/27] device_manager: save resource allocator in snapshot vm-allocator now allows us to (De)serialize IdAllocator and AddressAllocator types. Add ResourceAllocator in DeviceManager snapshot state and restore it when loading a snapshot. Like this we can avoid doing the ExactMatch allocations during snapshot resumes for reserving the exact same MMIO ranges. Moreover, change DeviceManager and PciDevices to provide save/restore functionality via the Persist trait. Like that we can avoid first creating the objects and then restoring their state, overwriting their fields. 
Signed-off-by: Babis Chalios --- Cargo.lock | 1 + src/vmm/Cargo.toml | 2 +- src/vmm/src/builder.rs | 22 ++-- src/vmm/src/device_manager/mod.rs | 158 +++++++++++++++--------- src/vmm/src/device_manager/pci_mngr.rs | 49 +++++--- src/vmm/src/device_manager/persist.rs | 38 ++---- src/vmm/src/device_manager/resources.rs | 130 ++++++++++++++++++- src/vmm/src/devices/acpi/vmgenid.rs | 5 - src/vmm/src/lib.rs | 1 + src/vmm/src/persist.rs | 1 + 10 files changed, 283 insertions(+), 124 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2f2bee5385b..6865762136f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1623,6 +1623,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c2fce39487bd03b5b0ab176f584682e9eaab7875254bafd3d188c69c85fce6e" dependencies = [ "libc", + "serde", "thiserror 2.0.12", ] diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 40d79091abf..8dd8192e42d 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -52,7 +52,7 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } uuid = "1.16.0" vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } -vm-allocator = "0.1.2" +vm-allocator = { version = "0.1.2", features = ["serde"] } vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.2", features = [ "backend-mmap", diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 5a255f5cf7b..3284b11f559 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -42,6 +42,7 @@ use crate::logger::debug; use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; +use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::{Kvm, KvmError}; @@ -411,8 +412,6 @@ pub fn build_microvm_from_snapshot( .create_vcpus(vm_resources.machine_config.vcpu_count) .map_err(StartMicrovmError::Vm)?; - let mut device_manager = 
DeviceManager::new(event_manager, &vcpus_exit_evt, &vm).unwrap(); - vm.register_memory_regions(guest_memory) .map_err(StartMicrovmError::Vm)?; @@ -430,16 +429,6 @@ pub fn build_microvm_from_snapshot( } } - // Restore allocator state - #[cfg(target_arch = "aarch64")] - if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa { - allocate_pvtime_region( - &mut device_manager, - vcpus.len(), - vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0), - )?; - } - // Restore vcpus kvm state. for (vcpu, state) in vcpus.iter_mut().zip(microvm_state.vcpu_states.iter()) { vcpu.kvm_vcpu @@ -463,6 +452,9 @@ pub fn build_microvm_from_snapshot( vm_resources.boot_source.config = microvm_state.vm_info.boot_source; // Restore devices states. + // Restoring VMGenID injects an interrupt in the guest to notify it about the new generation + // ID. As a result, we need to restore DeviceManager after restoring the KVM state, otherwise + // the injected interrupt will be overwritten. let device_ctor_args = DeviceRestoreArgs { mem: vm.guest_memory(), vm: &vm, @@ -470,9 +462,11 @@ pub fn build_microvm_from_snapshot( vm_resources, instance_id: &instance_info.id, restored_from_file: uffd.is_none(), + vcpus_exit_evt: &vcpus_exit_evt, }; - - device_manager.restore(µvm_state.device_states, device_ctor_args)?; + #[allow(unused_mut)] + let mut device_manager = + DeviceManager::restore(device_ctor_args, µvm_state.device_states)?; let mut vmm = Vmm { events_observer: Some(std::io::stdin()), diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index da61db922c3..e60d64394e8 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -15,7 +15,7 @@ use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; use log::error; use mmio::{MMIODeviceManager, MmioError}; -use pci_mngr::{PciDevices, PciManagerError}; +use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use 
persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; @@ -127,30 +127,39 @@ impl DeviceManager { Ok(serial) } + #[cfg(target_arch = "x86_64")] + fn create_legacy_devices( + event_manager: &mut EventManager, + vcpus_exit_evt: &EventFd, + vm: &Vm, + resource_allocator: &ResourceAllocator, + ) -> Result { + Self::set_stdout_nonblocking(); + + // Create serial device + let serial = Self::setup_serial_device(event_manager)?; + let reset_evt = vcpus_exit_evt + .try_clone() + .map_err(DeviceManagerCreateError::EventFd)?; + // Create keyboard emulator for reset event + let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); + + // create pio dev manager with legacy devices + let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; + Ok(legacy_devices) + } + #[cfg_attr(target_arch = "aarch64", allow(unused))] pub fn new( event_manager: &mut EventManager, - vcpu_exit_evt: &EventFd, + vcpus_exit_evt: &EventFd, vm: &Vm, ) -> Result { let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] - let legacy_devices = { - Self::set_stdout_nonblocking(); - - // Create serial device - let serial = Self::setup_serial_device(event_manager)?; - let reset_evt = vcpu_exit_evt - .try_clone() - .map_err(DeviceManagerCreateError::EventFd)?; - // Create keyboard emulator for reset event - let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); - - // create pio dev manager with legacy devices - let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; - legacy_devices - }; + let legacy_devices = + Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm, &resource_allocator)?; Ok(DeviceManager { resource_allocator, @@ -270,6 +279,8 @@ impl DeviceManager { #[derive(Debug, Default, Clone, 
Serialize, Deserialize)] /// State of devices in the system pub struct DevicesState { + /// Resource allocator state + pub resource_allocator_state: resources::ResourceAllocatorState, /// MMIO devices state pub mmio_state: persist::DeviceStates, /// ACPI devices state @@ -292,12 +303,15 @@ pub enum DevicePersistError { SerialRestore(#[from] EmulateSerialInitError), /// Error inserting device in bus: {0} Bus(#[from] vm_device::BusError), + /// Error creating DeviceManager: {0} + DeviceManager(#[from] DeviceManagerCreateError), } pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, pub vm: &'a Vm, pub event_manager: &'a mut EventManager, + pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -315,15 +329,82 @@ impl std::fmt::Debug for DeviceRestoreArgs<'_> { } } -impl DeviceManager { - pub fn save(&self) -> DevicesState { +impl<'a> Persist<'a> for DeviceManager { + type State = DevicesState; + type ConstructorArgs = DeviceRestoreArgs<'a>; + type Error = DevicePersistError; + + fn save(&self) -> Self::State { DevicesState { + resource_allocator_state: self.resource_allocator.save(), mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), pci_state: self.pci_devices.save(), } } + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + // Safe to unwrap here. ResourceAllocator restoring cannot fail. 
+ let resource_allocator = + Arc::new(ResourceAllocator::restore((), &state.resource_allocator_state).unwrap()); + + // Setup legacy devices in case of x86 + #[cfg(target_arch = "x86_64")] + let legacy_devices = Self::create_legacy_devices( + constructor_args.event_manager, + constructor_args.vcpus_exit_evt, + constructor_args.vm, + &resource_allocator, + )?; + + // Restore MMIO devices + let mmio_ctor_args = MMIODevManagerConstructorArgs { + mem: constructor_args.mem, + vm: constructor_args.vm, + event_manager: constructor_args.event_manager, + resource_allocator: &resource_allocator, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + }; + let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; + + // Restore ACPI devices + let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { + mem: constructor_args.mem, + resource_allocator: &resource_allocator, + vm: constructor_args.vm, + }; + let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; + acpi_devices.notify_vmgenid()?; + + // Restore PCI devices + let pci_ctor_args = PciDevicesConstructorArgs { + resource_allocator: &resource_allocator, + }; + let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; + + let device_manager = DeviceManager { + resource_allocator, + mmio_devices, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices, + pci_devices, + }; + + // Restore serial. 
+ // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 + device_manager.emulate_serial_init()?; + + Ok(device_manager) + } +} + +impl DeviceManager { /// Sets RDA bit in serial console pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { // When restoring from a previously saved state, there is no serial @@ -361,43 +442,6 @@ impl DeviceManager { Ok(()) } } - - pub fn restore( - &mut self, - state: &DevicesState, - restore_args: DeviceRestoreArgs, - ) -> Result<(), DevicePersistError> { - // Restore MMIO devices - let mmio_ctor_args = MMIODevManagerConstructorArgs { - mem: restore_args.mem, - vm: restore_args.vm, - event_manager: restore_args.event_manager, - resource_allocator: &self.resource_allocator, - vm_resources: restore_args.vm_resources, - instance_id: restore_args.instance_id, - restored_from_file: restore_args.restored_from_file, - }; - self.mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; - - // Restore serial. 
- // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 - self.emulate_serial_init()?; - - // Restore ACPI devices - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: restore_args.mem, - resource_allocator: &self.resource_allocator, - vm: restore_args.vm, - }; - self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; - self.acpi_devices.notify_vmgenid()?; - - // Restore PCI devices - self.pci_devices - .restore(&state.pci_state, &self.resource_allocator)?; - - Ok(()) - } } #[cfg(test)] diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 686349858fb..70bb03388f6 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -16,6 +16,7 @@ use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::snapshot::Persist; use crate::vstate::vm::InterruptError; #[derive(Debug, Default)] @@ -65,24 +66,6 @@ impl PciDevices { Ok(()) } - pub fn save(&self) -> PciDevicesState { - PciDevicesState { - pci_enabled: self.pci_segment.is_some(), - } - } - - pub fn restore( - &mut self, - state: &PciDevicesState, - resource_allocator: &Arc, - ) -> Result<(), PciManagerError> { - if state.pci_enabled { - self.attach_pci_segment(resource_allocator)?; - } - - Ok(()) - } - fn register_bars_with_bus( resource_allocator: &ResourceAllocator, virtio_device: &Arc>, @@ -194,3 +177,33 @@ impl PciDevices { pub struct PciDevicesState { pci_enabled: bool, } + +#[derive(Debug)] +pub struct PciDevicesConstructorArgs<'a> { + pub resource_allocator: &'a Arc, +} + +impl<'a> Persist<'a> for PciDevices { + type State = PciDevicesState; + type ConstructorArgs = PciDevicesConstructorArgs<'a>; + type Error = PciManagerError; + + fn save(&self) -> Self::State { + 
PciDevicesState { + pci_enabled: self.pci_segment.is_some(), + } + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mut pci_devices = PciDevices::new(); + + if state.pci_enabled { + pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + } + + Ok(pci_devices) + } +} diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 6b1168ec965..1952fdaee40 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -9,7 +9,6 @@ use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; use log::{error, warn}; use serde::{Deserialize, Serialize}; -use vm_allocator::AllocPolicy; use super::acpi::ACPIDeviceManager; use super::mmio::*; @@ -471,27 +470,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { .map_err(|()| DevicePersistError::MmioTransport)?, )); - // We do not currently require exact re-allocation of IDs via - // `dev_manager.irq_allocator.allocate_id()` and currently cannot do - // this effectively as `IdAllocator` does not implement an exact - // match API. - // In the future we may require preserving `IdAllocator`'s state - // after snapshot restore so as to restore the exact interrupt IDs - // from the original device's state for implementing hot-plug. - // For now this is why we do not restore the state of the - // `IdAllocator` under `dev_manager`. 
- - constructor_args - .resource_allocator - .allocate_32bit_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - dev_manager.register_mmio_virtio( vm, id.clone(), @@ -678,6 +656,7 @@ mod tests { use super::*; use crate::builder::tests::*; + use crate::device_manager; use crate::devices::virtio::block::CacheType; use crate::resources::VmmConfig; use crate::snapshot::Snapshot; @@ -748,11 +727,10 @@ mod tests { #[test] fn test_device_manager_persistence() { - let mut buf = vec![0; 16384]; + let mut buf = vec![0; 65536]; // These need to survive so the restored blocks find them. let _block_files; let mut tmp_sock_file = TempFile::new().unwrap(); - let resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. { @@ -812,7 +790,10 @@ mod tests { let mut event_manager = EventManager::new().expect("Unable to create EventManager"); let vmm = default_vmm(); - let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let resource_allocator = + ResourceAllocator::restore((), &device_manager_state.resource_allocator_state).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), @@ -824,7 +805,7 @@ mod tests { restored_from_file: true, }; let _restored_dev_manager = - MMIODeviceManager::restore(restore_args, &device_states).unwrap(); + MMIODeviceManager::restore(restore_args, &device_manager_state.mmio_state).unwrap(); let expected_vm_resources = format!( r#"{{ @@ -899,7 +880,10 @@ mod tests { .version(), MmdsVersion::V2 ); - assert_eq!(device_states.mmds_version.unwrap(), MmdsVersion::V2.into()); + assert_eq!( + 
device_manager_state.mmio_state.mmds_version.unwrap(), + MmdsVersion::V2.into() + ); assert_eq!( expected_vm_resources, serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs index 249d0507ba8..f7035e55566 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/device_manager/resources.rs @@ -1,14 +1,17 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::convert::Infallible; use std::sync::{Arc, Mutex}; use pci::DeviceRelocation; +use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; use vm_device::Bus; use crate::arch; +use crate::snapshot::Persist; /// A resource manager for (de)allocating interrupt lines (GSIs) and guest memory /// @@ -152,6 +155,69 @@ impl ResourceAllocator { } } +impl<'a> Persist<'a> for ResourceAllocator { + type State = ResourceAllocatorState; + type ConstructorArgs = (); + type Error = Infallible; + + fn save(&self) -> Self::State { + ResourceAllocatorState { + gsi_allocator: self.gsi_allocator.clone(), + mmio32_memory: self.mmio32_memory.clone(), + mmio64_memory: self.mmio64_memory.clone(), + system_memory: self.system_memory.clone(), + } + } + + fn restore( + _constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + Ok(ResourceAllocator { + gsi_allocator: state.gsi_allocator.clone(), + mmio32_memory: state.mmio32_memory.clone(), + mmio64_memory: state.mmio64_memory.clone(), + system_memory: state.system_memory.clone(), + mmio_bus: Arc::new(Bus::new()), + #[cfg(target_arch = "x86_64")] + pio_bus: Arc::new(Bus::new()), + }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ResourceAllocatorState { + // Allocator for device interrupt lines + pub gsi_allocator: Arc>, + // Allocator for memory in the 32-bit MMIO address 
space + pub mmio32_memory: Arc>, + // Allocator for memory in the 64-bit MMIO address space + pub mmio64_memory: Arc>, + // Memory allocator for system data + pub system_memory: Arc>, +} + +impl Default for ResourceAllocatorState { + fn default() -> Self { + Self { + gsi_allocator: Arc::new(Mutex::new( + IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), + )), + mmio32_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE) + .unwrap(), + )), + mmio64_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE) + .unwrap(), + )), + system_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE).unwrap(), + )), + } + } +} + impl DeviceRelocation for ResourceAllocator { fn move_bar( &self, @@ -167,8 +233,11 @@ impl DeviceRelocation for ResourceAllocator { #[cfg(test)] mod tests { - use super::ResourceAllocator; - use crate::arch; + use vm_allocator::AllocPolicy; + + use super::{ResourceAllocator, ResourceAllocatorState}; + use crate::arch::{self, IRQ_BASE}; + use crate::snapshot::{Persist, Snapshot}; const MAX_IRQS: u32 = arch::IRQ_MAX - arch::IRQ_BASE + 1; @@ -210,4 +279,61 @@ mod tests { assert_eq!(allocator.allocate_gsi(1), Ok(vec![i])); } } + + fn clone_allocator(allocator: &ResourceAllocator) -> ResourceAllocator { + let mut buf = vec![0u8; 1024]; + Snapshot::serialize(&mut buf.as_mut_slice(), &allocator.save()).unwrap(); + let restored_state: ResourceAllocatorState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + ResourceAllocator::restore((), &restored_state).unwrap() + } + + #[test] + fn test_save_restore() { + let allocator0 = ResourceAllocator::new().unwrap(); + let gsi_0 = allocator0.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_0, IRQ_BASE); + + let allocator1 = clone_allocator(&allocator0); + let gsi_1 = allocator1.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_1, IRQ_BASE + 1); + let 
mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START); + + let allocator2 = clone_allocator(&allocator1); + allocator2 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio32_mem)) + .unwrap_err(); + allocator2 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio64_mem)) + .unwrap_err(); + allocator2 + .allocate_system_memory(0x42, 1, AllocPolicy::ExactMatch(system_mem)) + .unwrap_err(); + + let gsi_2 = allocator2.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_2, IRQ_BASE + 2); + let mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START + 0x42); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START + 0x42); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START + 0x42); + } } diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index df0656bfbcc..0cf0ae0d7b1 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -152,11 +152,6 @@ impl<'a> Persist<'a> for VmGenId { constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - constructor_args.resource_allocator.allocate_system_memory( - VMGENID_MEM_SIZE, - 8, - vm_allocator::AllocPolicy::ExactMatch(state.addr), - )?; Self::from_parts(GuestAddress(state.addr), state.gsi, 
constructor_args.mem) } } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 18177367ada..b3efc12a500 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -126,6 +126,7 @@ use devices::acpi::vmgenid::VmGenIdError; use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; +use snapshot::Persist; use userfaultfd::Uffd; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 6fd5ca89081..b54cb2608ca 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -599,6 +599,7 @@ mod tests { #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::devices::virtio::block::CacheType; + use crate::snapshot::Persist; use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; From 9534b279b8d0372c9f843c0da1d4ae1e254ac55c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 6 Jun 2025 13:36:38 +0200 Subject: [PATCH 17/27] refactor: VirtIO MMIO persistence logic VirtIO MMIO restore logic activates the device the moment we restore the device state, if the device was activated when snapshotted. Move the activation responsibility to the logic the restores the MMIO transport. The reason for this change is that that's how it will be done for the PCI transport. Unifying this will allow us reusing the same types for restoring the non-transport state of devices. Note that we needed to change the way Net devices are saved/restored. RxBuffer type of Net devices holds RX descriptors that we have parsed from the Queue ahead of time. The way we restored this info was manipulating the queue to re-parse the RX descriptors during the restore phase. However, we need the device to be activated to do so, which now isn't. 
So, instead of storing this info inside the snapshot make sure we have flushed everything before taking the snapshot. Also, simplify a bit the types that we use for serializing/deserializing the state of a device. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mmio.rs | 80 -------- src/vmm/src/device_manager/mod.rs | 109 +++++++++- src/vmm/src/device_manager/persist.rs | 191 ++++++------------ src/vmm/src/devices/virtio/balloon/persist.rs | 36 ++-- src/vmm/src/devices/virtio/block/persist.rs | 10 +- .../devices/virtio/block/virtio/persist.rs | 18 +- src/vmm/src/devices/virtio/net/device.rs | 21 ++ src/vmm/src/devices/virtio/net/persist.rs | 47 +---- src/vmm/src/devices/virtio/rng/persist.rs | 16 +- src/vmm/src/devices/virtio/vsock/persist.rs | 14 +- src/vmm/src/lib.rs | 2 +- src/vmm/src/persist.rs | 15 +- 12 files changed, 224 insertions(+), 335 deletions(-) diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 2d4432470d1..2d6cde39c52 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -16,7 +16,6 @@ use kvm_ioctls::IoEventAddress; use linux_loader::cmdline as kernel_cmdline; #[cfg(target_arch = "x86_64")] use log::debug; -use log::info; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; @@ -28,14 +27,8 @@ use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; -use crate::devices::virtio::balloon::Balloon; -use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::net::Net; -use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::MmioTransport; -use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; #[cfg(target_arch = "x86_64")] use 
crate::vstate::memory::GuestAddress; @@ -442,79 +435,6 @@ impl MMIODeviceManager { Ok(()) } - /// Artificially kick devices as if they had external events. - pub fn kick_devices(&self) { - info!("Artificially kick devices."); - // We only kick virtio devices for now. - let _: Result<(), MmioError> = self.for_each_virtio_device(|virtio_type, id, device| { - let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); - let mut virtio = mmio_transport_locked.locked_device(); - match *virtio_type { - TYPE_BALLOON => { - let balloon = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. - if balloon.is_activated() { - info!("kick balloon {}.", id); - balloon.process_virtio_queues(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. - if let Some(block) = virtio.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", id); - block.process_virtio_queues(); - } - } - } - TYPE_NET => { - let net = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. 
- if net.is_activated() { - info!("kick net {}.", id); - net.process_virtio_queues(); - } - } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. - let vsock = virtio - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {id}."); - vsock.signal_used_queue(0).unwrap(); - } - } - TYPE_RNG => { - let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {id}."); - entropy.process_virtio_queues(); - } - } - _ => (), - } - Ok(()) - }); - } - #[cfg(target_arch = "aarch64")] pub fn virtio_device_info(&self) -> Vec<&MMIODeviceInfo> { let mut device_info = Vec::new(); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index e60d64394e8..df290445787 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -5,6 +5,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
+use std::convert::Infallible; use std::fmt::Debug; use std::sync::{Arc, Mutex}; @@ -13,7 +14,7 @@ use event_manager::{MutEventSubscriber, SubscriberOps}; #[cfg(target_arch = "x86_64")] use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; -use log::error; +use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; @@ -30,8 +31,14 @@ use crate::devices::legacy::RTCDevice; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; use crate::devices::pseudo::BootTimer; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -274,6 +281,106 @@ impl DeviceManager { self.pci_devices .attach_pci_segment(&self.resource_allocator) } + + fn do_kick_device(virtio_device: Arc>) { + let mut device = virtio_device.lock().expect("Poisoned lock"); + match device.device_type() { + TYPE_BALLOON => { + let balloon = device.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the balloon queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. 
+ if balloon.is_activated() { + info!("kick balloon {}.", balloon.id()); + balloon.process_virtio_queues(); + } + } + TYPE_BLOCK => { + // We only care about kicking virtio block. + // If we need to kick vhost-user-block we can do nothing. + if let Some(block) = device.as_mut_any().downcast_mut::() { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if block.is_activated() { + info!("kick block {}.", block.id()); + block.process_virtio_queues(); + } + } + } + TYPE_NET => { + let net = device.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if net.is_activated() { + info!("kick net {}.", net.id()); + net.process_virtio_queues(); + } + } + TYPE_VSOCK => { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
+ let vsock = device + .as_mut_any() + .downcast_mut::>() + .unwrap(); + if vsock.is_activated() { + info!("kick vsock {}.", vsock.id()); + vsock.signal_used_queue(0).unwrap(); + } + } + TYPE_RNG => { + let entropy = device.as_mut_any().downcast_mut::().unwrap(); + if entropy.is_activated() { + info!("kick entropy {}.", entropy.id()); + entropy.process_virtio_queues(); + } + } + _ => (), + } + } + + /// Artificially kick VirtIO devices as if they had external events. + pub fn kick_virtio_devices(&self) { + info!("Artificially kick devices"); + // Go through MMIO VirtIO devices + let _: Result<(), MmioError> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + Self::do_kick_device(mmio_transport_locked.device()); + Ok(()) + }); + } + + fn do_mark_virtio_queue_memory_dirty( + device: Arc>, + mem: &GuestMemoryMmap, + ) { + // SAFETY: + // This should never fail as we mark pages only if device has already been activated, + // and the address validation was already performed on device activation. 
+        let locked_device = device.lock().expect("Poisoned lock");
+        if locked_device.is_activated() {
+            locked_device.mark_queue_memory_dirty(mem).unwrap()
+        }
+    }
+
+    /// Mark queue memory dirty for activated VirtIO devices
+    pub fn mark_virtio_queue_memory_dirty(&self, mem: &GuestMemoryMmap) {
+        // Go through MMIO VirtIO devices
+        let _: Result<(), Infallible> = self.mmio_devices.for_each_virtio_device(|_, _, device| {
+            let mmio_transport_locked = device.inner.lock().expect("Poisoned lock");
+            Self::do_mark_virtio_queue_memory_dirty(mmio_transport_locked.device(), mem);
+            Ok(())
+        });
+    }
 }
 
 #[derive(Debug, Default, Clone, Serialize, Deserialize)]
diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs
index 1952fdaee40..93385805e7b 100644
--- a/src/vmm/src/device_manager/persist.rs
+++ b/src/vmm/src/device_manager/persist.rs
@@ -42,7 +42,7 @@ use crate::devices::virtio::vsock::persist::{
 use crate::devices::virtio::vsock::{
     TYPE_VSOCK, Vsock, VsockError, VsockUnixBackend, VsockUnixBackendError,
 };
-use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG};
+use crate::devices::virtio::{ActivateError, TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG};
 use crate::mmds::data_store::MmdsVersion;
 use crate::resources::{ResourcesError, VmResources};
 use crate::snapshot::Persist;
@@ -78,67 +78,17 @@ pub enum DevicePersistError {
     Entropy(#[from] EntropyError),
     /// Resource misconfiguration: {0}. Is the snapshot file corrupted?
     ResourcesError(#[from] ResourcesError),
+    /// Could not activate device: {0}
+    DeviceActivation(#[from] ActivateError),
 }
 
-/// Holds the state of a balloon device connected to the MMIO space.
+/// Holds the state of a MMIO VirtIO device
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ConnectedBalloonState {
+pub struct VirtioDeviceState<T> {
     /// Device identifier.
     pub device_id: String,
     /// Device state.
-    pub device_state: BalloonState,
-    /// Mmio transport state.
- pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a virtio block device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBlockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: BlockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a net device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedNetState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: NetState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a vsock device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedVsockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: VsockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of an entropy device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedEntropyState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: EntropyState, + pub device_state: T, /// Mmio transport state. pub transport_state: MmioTransportState, /// VmmResources. @@ -187,17 +137,17 @@ pub struct DeviceStates { // State of legacy devices in MMIO space. pub legacy_devices: Vec, /// Block device states. - pub block_devices: Vec, + pub block_devices: Vec>, /// Net device states. - pub net_devices: Vec, + pub net_devices: Vec>, /// Vsock device state. - pub vsock_device: Option, + pub vsock_device: Option>, /// Balloon device state. 
- pub balloon_device: Option, + pub balloon_device: Option>, /// Mmds version. pub mmds_version: Option, /// Entropy device state. - pub entropy_device: Option, + pub entropy_device: Option>, } /// A type used to extract the concrete `Arc>` for each of the device @@ -311,20 +261,22 @@ impl<'a> Persist<'a> for MMIODeviceManager { let _: Result<(), ()> = self.for_each_virtio_device(|_, devid, device| { let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); let transport_state = mmio_transport_locked.save(); + let device_info = device.resources; + let device_id = devid.clone(); let mut locked_device = mmio_transport_locked.locked_device(); match locked_device.device_type() { TYPE_BALLOON => { - let balloon_state = locked_device + let device_state = locked_device .as_any() .downcast_ref::() .unwrap() .save(); - states.balloon_device = Some(ConnectedBalloonState { - device_id: devid.clone(), - device_state: balloon_state, + states.balloon_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } // Both virtio-block and vhost-user-block share same device type. 
@@ -337,16 +289,17 @@ impl<'a> Persist<'a> for MMIODeviceManager { ); } else { block.prepare_save(); - states.block_devices.push(ConnectedBlockState { - device_id: devid.clone(), - device_state: block.save(), + let device_state = block.save(); + states.block_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, - }) + device_info, + }); } } TYPE_NET => { - let net = locked_device.as_any().downcast_ref::().unwrap(); + let net = locked_device.as_mut_any().downcast_mut::().unwrap(); if let (Some(mmds_ns), None) = (net.mmds_ns.as_ref(), states.mmds_version.as_ref()) { @@ -354,11 +307,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { Some(mmds_ns.mmds.lock().expect("Poisoned lock").version().into()); } - states.net_devices.push(ConnectedNetState { - device_id: devid.clone(), - device_state: net.save(), + net.prepare_save(); + let device_state = net.save(); + states.net_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } TYPE_VSOCK => { @@ -378,16 +333,16 @@ impl<'a> Persist<'a> for MMIODeviceManager { // Save state after potential notification to the guest. This // way we save changes to the queue the notification can cause. 
- let vsock_state = VsockState { + let device_state = VsockState { backend: vsock.backend().save(), frontend: vsock.save(), }; - states.vsock_device = Some(ConnectedVsockState { - device_id: devid.clone(), - device_state: vsock_state, + states.vsock_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } TYPE_RNG => { @@ -395,12 +350,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { .as_mut_any() .downcast_mut::() .unwrap(); + let device_state = entropy.save(); - states.entropy_device = Some(ConnectedEntropyState { - device_id: devid.clone(), - device_state: entropy.save(), + states.entropy_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } _ => unreachable!(), @@ -450,19 +406,20 @@ impl<'a> Persist<'a> for MMIODeviceManager { } let mut restore_helper = |device: Arc>, + activated: bool, is_vhost_user: bool, as_subscriber: Arc>, id: &String, state: &MmioTransportState, - interrupt: Arc, device_info: &MMIODeviceInfo, mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { + let interrupt = Arc::new(IrqTrigger::new()); let restore_args = MmioTransportConstructorArgs { mem: mem.clone(), - interrupt, - device, + interrupt: interrupt.clone(), + device: device.clone(), is_vhost_user, }; let mmio_transport = Arc::new(Mutex::new( @@ -480,16 +437,21 @@ impl<'a> Persist<'a> for MMIODeviceManager { }, )?; + if activated { + device + .lock() + .expect("Poisoned lock") + .activate(mem.clone(), interrupt)?; + } + event_manager.add_subscriber(as_subscriber); Ok(()) }; if let Some(balloon_state) = &state.balloon_device { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Balloon::restore( BalloonConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), restored_from_file: constructor_args.restored_from_file, }, &balloon_state.device_state, @@ -501,11 
+463,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + balloon_state.device_state.virtio_state.activated, false, device, &balloon_state.device_id, &balloon_state.transport_state, - interrupt, &balloon_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -513,12 +475,8 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for block_state in &state.block_devices { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Block::restore( - BlockConstructorArgs { - mem: mem.clone(), - interrupt: interrupt.clone(), - }, + BlockConstructorArgs { mem: mem.clone() }, &block_state.device_state, )?)); @@ -528,11 +486,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + block_state.device_state.is_activated(), false, device, &block_state.device_id, &block_state.transport_state, - interrupt, &block_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -556,11 +514,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for net_state in &state.net_devices { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Net::restore( NetConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), mmds: constructor_args .vm_resources .mmds @@ -577,11 +533,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + net_state.device_state.virtio_state.activated, false, device, &net_state.device_id, &net_state.transport_state, - interrupt, &net_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -593,11 +549,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { cid: vsock_state.device_state.frontend.cid, }; let backend = VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend)?; - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Vsock::restore( VsockConstructorArgs { mem: 
mem.clone(), - interrupt: interrupt.clone(), backend, }, &vsock_state.device_state.frontend, @@ -609,11 +563,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + vsock_state.device_state.frontend.virtio_state.activated, false, device, &vsock_state.device_id, &vsock_state.transport_state, - interrupt, &vsock_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -621,8 +575,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { } if let Some(entropy_state) = &state.entropy_device { - let interrupt = Arc::new(IrqTrigger::new()); - let ctor_args = EntropyConstructorArgs::new(mem.clone(), interrupt.clone()); + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; let device = Arc::new(Mutex::new(Entropy::restore( ctor_args, @@ -635,11 +588,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + entropy_state.device_state.virtio_state.activated, false, device, &entropy_state.device_id, &entropy_state.transport_state, - interrupt, &entropy_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -665,29 +618,8 @@ mod tests { use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::VsockDeviceConfig; - impl PartialEq for ConnectedBalloonState { - fn eq(&self, other: &ConnectedBalloonState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedBlockState { - fn eq(&self, other: &ConnectedBlockState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedNetState { - fn eq(&self, other: &ConnectedNetState) -> bool { - // Actual device state equality is checked by the device's tests. 
- self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedVsockState { - fn eq(&self, other: &ConnectedVsockState) -> bool { + impl PartialEq for VirtioDeviceState { + fn eq(&self, other: &VirtioDeviceState) -> bool { // Actual device state equality is checked by the device's tests. self.transport_state == other.transport_state && self.device_info == other.device_info } @@ -699,6 +631,7 @@ mod tests { && self.block_devices == other.block_devices && self.net_devices == other.net_devices && self.vsock_device == other.vsock_device + && self.entropy_device == other.entropy_device } } diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index a6634d07170..15ae1e26b9e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -87,7 +87,7 @@ pub struct BalloonState { stats_desc_index: Option, latest_stats: BalloonStatsState, config_space: BalloonConfigSpaceState, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -95,8 +95,6 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt used from the device. - pub interrupt: Arc, pub restored_from_file: bool, } @@ -154,25 +152,18 @@ impl Persist<'_> for Balloon { actual_pages: state.config_space.actual_pages, }; - if state.virtio_state.activated { - balloon.device_state = DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }); - - if balloon.stats_enabled() { - // Restore the stats descriptor. - balloon.set_stats_desc_index(state.stats_desc_index); - - // Restart timer if needed. 
-                let timer_state = TimerState::Periodic {
-                    current: Duration::from_secs(u64::from(state.stats_polling_interval_s)),
-                    interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)),
-                };
-                balloon
-                    .stats_timer
-                    .set_state(timer_state, SetTimeFlags::Default);
-            }
+        if state.virtio_state.activated && balloon.stats_enabled() {
+            // Restore the stats descriptor.
+            balloon.set_stats_desc_index(state.stats_desc_index);
+
+            // Restart timer if needed.
+            let timer_state = TimerState::Periodic {
+                current: Duration::from_secs(u64::from(state.stats_polling_interval_s)),
+                interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)),
+            };
+            balloon
+                .stats_timer
+                .set_state(timer_state, SetTimeFlags::Default);
         }
 
         Ok(balloon)
@@ -202,7 +193,6 @@ mod tests {
         let restored_balloon = Balloon::restore(
             BalloonConstructorArgs {
                 mem: guest_mem,
-                interrupt: default_interrupt(),
                 restored_from_file: true,
             },
             &Snapshot::deserialize(&mut mem.as_slice()).unwrap(),
diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs
index 57712a8fb3a..cb9a6471137 100644
--- a/src/vmm/src/devices/virtio/block/persist.rs
+++ b/src/vmm/src/devices/virtio/block/persist.rs
@@ -17,9 +17,17 @@ pub enum BlockState {
     VhostUser(VhostUserBlockState),
 }
 
+impl BlockState {
+    pub fn is_activated(&self) -> bool {
+        match self {
+            BlockState::Virtio(virtio_block_state) => virtio_block_state.virtio_state.activated,
+            BlockState::VhostUser(_) => false,
+        }
+    }
+}
+
 /// Auxiliary structure for creating a device when resuming from a snapshot.
#[derive(Debug)] pub struct BlockConstructorArgs { pub mem: GuestMemoryMmap, - pub interrupt: Arc, } diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 57e4a11b9c1..1c7a1bce106 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -58,7 +58,7 @@ pub struct VirtioBlockState { cache_type: CacheType, root_device: bool, disk_path: String, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, file_engine_type: FileEngineTypeState, } @@ -111,15 +111,6 @@ impl Persist<'_> for VirtioBlock { let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; - let device_state = if state.virtio_state.activated { - DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }) - } else { - DeviceState::Inactive - }; - let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), }; @@ -132,7 +123,7 @@ impl Persist<'_> for VirtioBlock { queues, queue_evts, - device_state, + device_state: DeviceState::Inactive, id: state.id.clone(), partuuid: state.partuuid.clone(), @@ -227,10 +218,7 @@ mod tests { // Restore the block device. 
let restored_block = VirtioBlock::restore( - BlockConstructorArgs { - mem: guest_mem, - interrupt: default_interrupt(), - }, + BlockConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index e8c0135263c..bf7a91e21f3 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -8,6 +8,7 @@ use std::collections::VecDeque; use std::mem::{self}; use std::net::Ipv4Addr; +use std::num::Wrapping; use std::ops::Deref; use std::sync::{Arc, Mutex}; @@ -930,6 +931,26 @@ impl Net { let _ = self.resume_rx(); let _ = self.process_tx(); } + + /// Prepare saving state + pub fn prepare_save(&mut self) { + // We shouldn't be messing with the queue if the device is not activated. + // Anyways, if it isn't there's nothing to prepare; we haven't parsed any + // descriptors yet from it and we can't have a deferred frame. + if !self.is_activated() { + return; + } + + // Give potential deferred RX frame to guest + self.rx_buffer.finish_frame(&mut self.queues[RX_INDEX]); + // Reset the parsed available descriptors, so we will re-parse them + self.queues[RX_INDEX].next_avail -= + Wrapping(u16::try_from(self.rx_buffer.parsed_descriptors.len()).unwrap()); + self.rx_buffer.parsed_descriptors.clear(); + self.rx_buffer.iovec.clear(); + self.rx_buffer.used_bytes = 0; + self.rx_buffer.used_descriptors = 0; + } } impl VirtioDevice for Net { diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 961b56556c8..6ef8ad842ac 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -30,27 +30,6 @@ pub struct NetConfigSpaceState { guest_mac: Option, } -/// Information about the parsed RX buffers -#[derive(Debug, Default, Clone, Serialize, Deserialize)] -pub struct RxBufferState { - // Number of iovecs we have parsed from 
the guest - parsed_descriptor_chains_nr: u16, - // Number of used descriptors - used_descriptors: u16, - // Number of used bytes - used_bytes: u32, -} - -impl RxBufferState { - fn from_rx_buffers(rx_buffer: &RxBuffers) -> Self { - RxBufferState { - parsed_descriptor_chains_nr: rx_buffer.parsed_descriptors.len().try_into().unwrap(), - used_descriptors: rx_buffer.used_descriptors, - used_bytes: rx_buffer.used_bytes, - } - } -} - /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -62,8 +41,7 @@ pub struct NetState { /// The associated MMDS network stack. pub mmds_ns: Option, config_space: NetConfigSpaceState, - virtio_state: VirtioDeviceState, - rx_buffers_state: RxBufferState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -71,8 +49,6 @@ pub struct NetState { pub struct NetConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt for the device. - pub interrupt: Arc, /// Pointer to the MMDS data store. pub mmds: Option>>, } @@ -108,7 +84,6 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), - rx_buffers_state: RxBufferState::from_rx_buffers(&self.rx_buffer), } } @@ -153,25 +128,6 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; - if state.virtio_state.activated { - let supported_flags: u32 = Net::build_tap_offload_features(net.acked_features); - net.tap - .set_offload(supported_flags) - .map_err(NetPersistError::TapSetOffload)?; - - net.device_state = DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }); - - // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily - // rolling back `next_avail` in the RX queue and call `parse_rx_descriptors`. 
- net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; - net.parse_rx_descriptors(); - net.rx_buffer.used_descriptors = state.rx_buffers_state.used_descriptors; - net.rx_buffer.used_bytes = state.rx_buffers_state.used_bytes; - } - Ok(net) } } @@ -215,7 +171,6 @@ mod tests { match Net::restore( NetConstructorArgs { mem: guest_mem, - interrupt: default_interrupt(), mmds: mmds_ds, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 75db947c9c7..d266e259418 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -19,20 +19,13 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EntropyState { - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, } #[derive(Debug)] pub struct EntropyConstructorArgs { - mem: GuestMemoryMmap, - interrupt: Arc, -} - -impl EntropyConstructorArgs { - pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { - Self { mem, interrupt } - } + pub mem: GuestMemoryMmap, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -72,9 +65,6 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - if state.virtio_state.activated { - entropy.set_activated(constructor_args.mem, constructor_args.interrupt); - } Ok(entropy) } @@ -99,7 +89,7 @@ mod tests { let guest_mem = create_virtio_mem(); let restored = Entropy::restore( - EntropyConstructorArgs::new(guest_mem, default_interrupt()), + EntropyConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs 
b/src/vmm/src/devices/virtio/vsock/persist.rs index 9d2fd61d9d5..6775707da3e 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -31,7 +31,7 @@ pub struct VsockState { pub struct VsockFrontendState { /// Context Identifier. pub cid: u64, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// An enum for the serializable backend state types. @@ -53,8 +53,6 @@ pub struct VsockUdsState { pub struct VsockConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt to use for the device. - pub interrupt: Arc, /// The vsock Unix Backend. pub backend: B, } @@ -123,14 +121,7 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.device_state = if state.virtio_state.activated { - DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }) - } else { - DeviceState::Inactive - }; + vsock.device_state = DeviceState::Inactive; Ok(vsock) } } @@ -193,7 +184,6 @@ pub(crate) mod tests { let mut restored_device = Vsock::restore( VsockConstructorArgs { mem: ctx.mem.clone(), - interrupt: default_interrupt(), backend: match restored_state.backend { VsockBackendState::Uds(uds_state) => { assert_eq!(uds_state.path, "test".to_owned()); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index b3efc12a500..d4cb5a78344 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -390,7 +390,7 @@ impl Vmm { /// Sends a resume command to the vCPUs. pub fn resume_vm(&mut self) -> Result<(), VmmError> { - self.device_manager.mmio_devices.kick_devices(); + self.device_manager.kick_virtio_devices(); // Send the events. 
self.vcpus_handles diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index b54cb2608ca..067d1083853 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -166,21 +166,8 @@ pub fn create_snapshot( // We need to mark queues as dirty again for all activated devices. The reason we // do it here is because we don't mark pages as dirty during runtime // for queue objects. - // SAFETY: - // This should never fail as we only mark pages only if device has already been activated, - // and the address validation was already performed on device activation. vmm.device_manager - .mmio_devices - .for_each_virtio_device(|_, _, device| { - let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); - let d = mmio_dev_locked.locked_device(); - if d.is_activated() { - d.mark_queue_memory_dirty(vmm.vm.guest_memory()) - } else { - Ok(()) - } - }) - .unwrap(); + .mark_virtio_queue_memory_dirty(vmm.vm.guest_memory()); Ok(()) } From c979c156f42d41f887fe0d70811638e35d6652b4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Jun 2025 11:39:21 +0200 Subject: [PATCH 18/27] pci: support snapshotting VirtIO PCI devices Support serializing the device-specific and transport state of a VirtIO device that uses the PCI transport. Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 4 +- src/vmm/src/device_manager/mod.rs | 19 +- src/vmm/src/device_manager/pci_mngr.rs | 421 +++++++++++++++++- .../virtio/transport/pci/common_config.rs | 2 +- .../devices/virtio/transport/pci/device.rs | 173 ++++--- 5 files changed, 551 insertions(+), 68 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 3284b11f559..d8d69a97314 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -451,6 +451,8 @@ pub fn build_microvm_from_snapshot( // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; + let vm = Arc::new(vm); + // Restore devices states. 
// Restoring VMGenID injects an interrupt in the guest to notify it about the new generation // ID. As a result, we need to restore DeviceManager after restoring the KVM state, otherwise @@ -473,7 +475,7 @@ pub fn build_microvm_from_snapshot( instance_info: instance_info.clone(), shutdown_exit_code: None, kvm, - vm: Arc::new(vm), + vm, uffd, vcpus_handles: Vec::new(), vcpus_exit_evt, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index df290445787..d3770b62109 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -357,6 +357,11 @@ impl DeviceManager { Self::do_kick_device(mmio_transport_locked.device()); Ok(()) }); + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_kick_device(virtio_device); + } } fn do_mark_virtio_queue_memory_dirty( @@ -380,6 +385,12 @@ impl DeviceManager { Self::do_mark_virtio_queue_memory_dirty(mmio_transport_locked.device(), mem); Ok(()) }); + + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_mark_virtio_queue_memory_dirty(virtio_device, mem); + } } } @@ -416,7 +427,7 @@ pub enum DevicePersistError { pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a Vm, + pub vm: &'a Arc, pub event_manager: &'a mut EventManager, pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, @@ -491,6 +502,12 @@ impl<'a> Persist<'a> for DeviceManager { // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { resource_allocator: &resource_allocator, + vm: constructor_args.vm.clone(), + mem: constructor_args.mem, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + event_manager: 
constructor_args.event_manager, }; let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 70bb03388f6..26a44dd29c9 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -5,19 +5,37 @@ use std::collections::HashMap; use std::fmt::Debug; use std::sync::{Arc, Mutex}; -use event_manager::MutEventSubscriber; -use log::debug; +use event_manager::{MutEventSubscriber, SubscriberOps}; +use log::{debug, error, warn}; use pci::{PciBarRegionType, PciDevice, PciDeviceError, PciRootError}; use serde::{Deserialize, Serialize}; use vm_device::BusError; -use crate::Vm; +use super::persist::{MmdsVersionState, SharedDeviceType}; use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; +use crate::devices::virtio::block::device::Block; +use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::net::persist::{NetConstructorArgs, NetState}; +use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::rng::persist::{EntropyConstructorArgs, EntropyState}; +use crate::devices::virtio::transport::pci::device::{ + VirtioPciDevice, VirtioPciDeviceError, VirtioPciDeviceState, +}; +use crate::devices::virtio::vsock::persist::{ + VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, +}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::resources::VmResources; use crate::snapshot::Persist; -use 
crate::vstate::vm::InterruptError; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; +use crate::{EventManager, Vm}; #[derive(Debug, Default)] pub struct PciDevices { @@ -119,6 +137,7 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration let msix_num = @@ -153,6 +172,9 @@ impl PciDevices { .expect("Poisoned lock") .add_device(pci_device_bdf.device() as u32, virtio_device.clone())?; + self.virtio_devices + .insert((device_type, id.clone()), virtio_device.clone()); + Self::register_bars_with_bus(resource_allocator, &virtio_device)?; virtio_device .lock() @@ -162,6 +184,54 @@ impl PciDevices { Ok(()) } + fn restore_pci_device( + &mut self, + vm: &Arc, + resource_allocator: &ResourceAllocator, + device: Arc>, + device_id: &str, + transport_state: &VirtioPciDeviceState, + event_manager: &mut EventManager, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let msi_vector_group = Arc::new(MsiVectorGroup::restore( + vm.clone(), + &transport_state.msi_vector_group, + )?); + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); + + let virtio_device = Arc::new(Mutex::new(VirtioPciDevice::new_from_state( + device_id.to_string(), + vm.guest_memory().clone(), + device.clone(), + msi_vector_group, + transport_state.clone(), + )?)); + + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device( + transport_state.pci_device_bdf.device() as u32, + virtio_device.clone(), + )?; + + self.virtio_devices + .insert((device_type, device_id.to_string()), virtio_device.clone()); + + Self::register_bars_with_bus(resource_allocator, 
&virtio_device)?; + virtio_device + .lock() + .expect("Poisoned lock") + .register_notification_ioevent(vm)?; + + event_manager.add_subscriber(device); + + Ok(()) + } + /// Gets the specified device. pub fn get_virtio_device( &self, @@ -173,14 +243,57 @@ impl PciDevices { } } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioDeviceState { + /// Device identifier + pub device_id: String, + /// Device BDF + pub pci_device_bdf: u32, + /// Device state + pub device_state: T, + /// Transport state + pub transport_state: VirtioPciDeviceState, +} + #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct PciDevicesState { - pci_enabled: bool, + /// Whether PCI is enabled + pub pci_enabled: bool, + /// Block device states. + pub block_devices: Vec>, + /// Net device states. + pub net_devices: Vec>, + /// Vsock device state. + pub vsock_device: Option>, + /// Balloon device state. + pub balloon_device: Option>, + /// Mmds version. + pub mmds_version: Option, + /// Entropy device state. 
+ pub entropy_device: Option>, } -#[derive(Debug)] pub struct PciDevicesConstructorArgs<'a> { + pub vm: Arc, + pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a Arc, + pub vm_resources: &'a mut VmResources, + pub instance_id: &'a str, + pub restored_from_file: bool, + pub event_manager: &'a mut EventManager, +} + +impl<'a> Debug for PciDevicesConstructorArgs<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciDevicesConstructorArgs") + .field("vm", &self.vm) + .field("mem", &self.mem) + .field("resource_allocator", &self.resource_allocator) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } } impl<'a> Persist<'a> for PciDevices { @@ -189,19 +302,305 @@ impl<'a> Persist<'a> for PciDevices { type Error = PciManagerError; fn save(&self) -> Self::State { - PciDevicesState { - pci_enabled: self.pci_segment.is_some(), + let mut state = PciDevicesState::default(); + if self.pci_segment.is_some() { + state.pci_enabled = true; + } else { + return state; } + + for pci_dev in self.virtio_devices.values() { + let locked_pci_dev = pci_dev.lock().expect("Poisoned lock"); + let transport_state = locked_pci_dev.state(); + let virtio_dev = locked_pci_dev.virtio_device(); + let mut locked_virtio_dev = virtio_dev.lock().expect("Poisoned lock"); + + let pci_device_bdf = transport_state.pci_device_bdf.into(); + + match locked_virtio_dev.device_type() { + TYPE_BALLOON => { + let balloon_device = locked_virtio_dev + .as_any() + .downcast_ref::() + .unwrap(); + + let device_state = balloon_device.save(); + + state.balloon_device = Some(VirtioDeviceState { + device_id: balloon_device.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + TYPE_BLOCK => { + let block_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if block_dev.is_vhost_user() { + warn!( + "Skipping 
vhost-user-block device. VhostUserBlock does not support \ + snapshotting yet" + ); + } else { + block_dev.prepare_save(); + let device_state = block_dev.save(); + state.block_devices.push(VirtioDeviceState { + device_id: block_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + } + TYPE_NET => { + let net_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if let (Some(mmds_ns), None) = + (net_dev.mmds_ns.as_ref(), state.mmds_version.as_ref()) + { + state.mmds_version = + Some(mmds_ns.mmds.lock().expect("Poisoned lock").version().into()); + } + net_dev.prepare_save(); + let device_state = net_dev.save(); + + state.net_devices.push(VirtioDeviceState { + device_id: net_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + TYPE_VSOCK => { + let vsock_dev = locked_virtio_dev + .as_mut_any() + // Currently, VsockUnixBackend is the only implementation of VsockBackend. + .downcast_mut::>() + .unwrap(); + + // Send Transport event to reset connections if device + // is activated. + if vsock_dev.is_activated() { + vsock_dev + .send_transport_reset_event() + .unwrap_or_else(|err| { + error!("Failed to send reset transport event: {:?}", err); + }); + } + + // Save state after potential notification to the guest. This + // way we save changes to the queue the notification can cause. 
+ let vsock_state = VsockState { + backend: vsock_dev.backend().save(), + frontend: vsock_dev.save(), + }; + + state.vsock_device = Some(VirtioDeviceState { + device_id: vsock_dev.id().to_string(), + pci_device_bdf, + device_state: vsock_state, + transport_state, + }); + } + TYPE_RNG => { + let rng_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + let device_state = rng_dev.save(); + + state.entropy_device = Some(VirtioDeviceState { + device_id: rng_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + _ => unreachable!(), + } + } + + state } fn restore( constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { + let mem = constructor_args.mem; let mut pci_devices = PciDevices::new(); + if !state.pci_enabled { + return Ok(pci_devices); + } + + pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + + if let Some(balloon_state) = &state.balloon_device { + let device = Arc::new(Mutex::new( + Balloon::restore( + BalloonConstructorArgs { + mem: mem.clone(), + restored_from_file: constructor_args.restored_from_file, + }, + &balloon_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Balloon(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &balloon_state.device_id, + &balloon_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + for block_state in &state.block_devices { + let device = Arc::new(Mutex::new( + Block::restore( + BlockConstructorArgs { mem: mem.clone() }, + &block_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::VirtioBlock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + 
&block_state.device_id, + &block_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + // If the snapshot has the mmds version persisted, initialise the data store with it. + if let Some(mmds_version) = &state.mmds_version { + constructor_args + .vm_resources + .set_mmds_version(mmds_version.clone().into(), constructor_args.instance_id) + .unwrap(); + } else if state + .net_devices + .iter() + .any(|dev| dev.device_state.mmds_ns.is_some()) + { + // If there's at least one network device having an mmds_ns, it means + // that we are restoring from a version that did not persist the `MmdsVersionState`. + // Init with the default. + constructor_args.vm_resources.mmds_or_default(); + } + + for net_state in &state.net_devices { + let device = Arc::new(Mutex::new( + Net::restore( + NetConstructorArgs { + mem: mem.clone(), + mmds: constructor_args + .vm_resources + .mmds + .as_ref() + // Clone the Arc reference. + .cloned(), + }, + &net_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Network(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &net_state.device_id, + &net_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(vsock_state) = &state.vsock_device { + let ctor_args = VsockUdsConstructorArgs { + cid: vsock_state.device_state.frontend.cid, + }; + let backend = + VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend).unwrap(); + let device = Arc::new(Mutex::new( + Vsock::restore( + VsockConstructorArgs { + mem: mem.clone(), + backend, + }, + &vsock_state.device_state.frontend, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Vsock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + 
constructor_args.resource_allocator, + device, + &vsock_state.device_id, + &vsock_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(entropy_state) = &state.entropy_device { + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; + + let device = Arc::new(Mutex::new( + Entropy::restore(ctor_args, &entropy_state.device_state).unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Entropy(device.clone())) + .unwrap(); - if state.pci_enabled { - pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &entropy_state.device_id, + &entropy_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() } Ok(pci_devices) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index c8ee2d1d2a9..6e52a1ca007 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -144,7 +144,7 @@ impl VirtioPciCommonConfig { } } - fn state(&self) -> VirtioPciCommonConfigState { + pub fn state(&self) -> VirtioPciCommonConfigState { VirtioPciCommonConfigState { driver_status: self.driver_status, config_generation: self.config_generation, diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 20c169297fd..6793d502f00 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -9,6 +9,7 @@ use std::any::Any; use std::cmp; +use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::io::Write; use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; @@ -41,6 +42,7 @@ use crate::devices::virtio::transport::pci::common_config::{ use 
crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{TYPE_BLOCK, TYPE_NET}; use crate::logger::{debug, error}; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; @@ -283,8 +285,8 @@ const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. -#[derive(Debug, Serialize, Deserialize)] -struct QueueState { +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct QueueState { max_size: u16, size: u16, ready: bool, @@ -293,14 +295,18 @@ struct QueueState { used_ring: u64, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct VirtioPciDeviceState { pub pci_device_bdf: PciBdf, - device_activated: bool, - queues: Vec, - interrupt_status: usize, - cap_pci_cfg_offset: usize, - cap_pci_cfg: Vec, + pub device_activated: bool, + pub interrupt_status: usize, + pub cap_pci_cfg_offset: usize, + pub cap_pci_cfg: Vec, + pub pci_configuration_state: PciConfigurationState, + pub pci_dev_state: VirtioPciCommonConfigState, + pub msix_state: MsixConfigState, + pub msi_vector_group: HashMap, + pub bar_configuration: Vec, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -337,7 +343,7 @@ pub struct VirtioPciDevice { // PCI interrupts. interrupt_status: Arc, virtio_interrupt: Option>, - interrupt_source_group: Arc, + interrupt_source_group: Arc, // Guest memory memory: GuestMemoryMmap, @@ -421,7 +427,6 @@ impl VirtioPciDevice { } /// Constructs a new PCI transport for the given virtio device. 
- #[allow(clippy::too_many_arguments)] pub fn new( id: String, memory: GuestMemoryMmap, @@ -464,7 +469,7 @@ impl VirtioPciDevice { device, device_activated: Arc::new(AtomicBool::new(false)), interrupt_status: Arc::new(AtomicUsize::new(0)), - virtio_interrupt: None, + virtio_interrupt: Some(interrupt), memory, settings_bar: 0, use_64bit_bar: true, @@ -476,6 +481,70 @@ impl VirtioPciDevice { Ok(virtio_pci_device) } + pub fn new_from_state( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + state: VirtioPciDeviceState, + ) -> Result { + let msix_config = Self::msix_config( + state.pci_device_bdf.into(), + msi_vectors.clone(), + Some(state.msix_state), + )?; + + let pci_config = Self::pci_configuration( + device.lock().expect("Poisoned lock").device_type(), + &msix_config, + Some(state.pci_configuration_state), + ); + let virtio_common_config = VirtioPciCommonConfig::new(state.pci_dev_state); + let cap_pci_cfg_info = VirtioPciCfgCapInfo { + offset: state.cap_pci_cfg_offset, + cap: *VirtioPciCfgCap::from_slice(&state.cap_pci_cfg).unwrap(), + }; + + let interrupt = Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_common_config.msix_config.clone(), + virtio_common_config.msix_queues.clone(), + msi_vectors.clone(), + )); + + let virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: state.pci_device_bdf, + configuration: pci_config, + common_config: virtio_common_config, + msix_config: Some(msix_config), + msix_num: msi_vectors.num_vectors(), + device, + device_activated: Arc::new(AtomicBool::new(state.device_activated)), + interrupt_status: Arc::new(AtomicUsize::new(state.interrupt_status)), + virtio_interrupt: Some(interrupt), + memory: memory.clone(), + settings_bar: 0, + use_64bit_bar: true, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info, + bar_regions: state.bar_configuration, + }; + + if state.device_activated { + virtio_pci_device + .device + .lock() + .expect("Poisoned lock") + .activate( + memory, + 
virtio_pci_device.virtio_interrupt.as_ref().unwrap().clone(), + ); + } + + Ok(virtio_pci_device) + } + fn is_driver_ready(&self) -> bool { let ready_bits = (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); @@ -657,6 +726,27 @@ impl VirtioPciDevice { } Ok(()) } + + pub fn state(&self) -> VirtioPciDeviceState { + VirtioPciDeviceState { + pci_device_bdf: self.pci_device_bdf, + device_activated: self.device_activated.load(Ordering::Acquire), + interrupt_status: self.interrupt_status.load(Ordering::Acquire), + cap_pci_cfg_offset: self.cap_pci_cfg_info.offset, + cap_pci_cfg: self.cap_pci_cfg_info.cap.bytes().to_vec(), + pci_configuration_state: self.configuration.state(), + pci_dev_state: self.common_config.state(), + msix_state: self + .msix_config + .as_ref() + .unwrap() + .lock() + .expect("Poisoned lock") + .state(), + msi_vector_group: self.interrupt_source_group.save(), + bar_configuration: self.bar_regions.clone(), + } + } } pub struct VirtioInterruptMsix { @@ -796,57 +886,33 @@ impl PciDevice for VirtioPciDevice { &mut self, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, - resources: Option>, + _resources: Option>, ) -> std::result::Result, PciDeviceError> { let mut bars = Vec::new(); let device_clone = self.device.clone(); let device = device_clone.lock().unwrap(); - let mut settings_bar_addr = None; - let mut use_64bit_bar = self.use_64bit_bar; - let restoring = resources.is_some(); - if let Some(resources) = resources { - for resource in resources { - if let Resource::PciBar { - index, base, type_, .. - } = resource - { - if index == VIRTIO_COMMON_BAR_INDEX { - settings_bar_addr = Some(GuestAddress(base)); - use_64bit_bar = match type_ { - PciBarType::Io => { - return Err(PciDeviceError::InvalidResource(resource)); - } - PciBarType::Mmio32 => false, - PciBarType::Mmio64 => true, - }; - break; - } - } - } - // Error out if no resource was matching the BAR id. 
- if settings_bar_addr.is_none() { - return Err(PciDeviceError::MissingResource); - } - } - // Allocate the virtio-pci capability BAR. // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 - let policy = match settings_bar_addr { - Some(addr) => AllocPolicy::ExactMatch(addr.0), - None => AllocPolicy::FirstMatch, - }; - let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let (virtio_pci_bar_addr, region_type) = if self.use_64bit_bar { let region_type = PciBarRegionType::Memory64BitRegion; let addr = mmio64_allocator - .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) .unwrap() .start(); (addr, region_type) } else { let region_type = PciBarRegionType::Memory32BitRegion; let addr = mmio32_allocator - .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) .unwrap() .start(); (addr, region_type) @@ -862,14 +928,12 @@ impl PciDevice for VirtioPciDevice { // happen only during the creation of a brand new VM. When a VM is // restored from a known state, the BARs are already created with the // right content, therefore we don't need to go through this codepath. - if !restoring { - self.configuration - .add_pci_bar(&bar) - .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; - // Once the BARs are allocated, the capabilities can be added to the PCI configuration. - self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; - } + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. 
+ self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; bars.push(bar); @@ -1015,6 +1079,7 @@ impl PciDevice for VirtioPciDevice { Arc::clone(self.virtio_interrupt.as_ref().unwrap()), ) .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + self.device_activated.store(true, Ordering::SeqCst); } else { debug!("Device doesn't need activation"); } From b424d279ed4e4ca734a6aa782e88f74b8cfebcdd Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 13 Jun 2025 16:45:02 +0200 Subject: [PATCH 19/27] refactor(vm): move ResourceAllocator inside Vm ResourceAllocator object was part of DeviceManager since it is (mainly) devices that use it. ResourceAllocator is as well the object that implements (in a dummy way, for the moment) the DeviceRelocation trait which PciDevices use to move the address space of a PciDevice when triggered from the guest. Problem with DeviceRelocation is that it also needs the Vm file descriptor to perform the relocation, because we need to move register the new IO event fd for VirtIO devices. To make things simpler, move ResourceAllocator inside the Vm object. In subsequent commit we will remove the DeviceRelocation from ResourceAllocator and move it to Vm instead. This has the nice secondary effect that we were able to simplify the signature of many device-related methods that received Vm and ResourceAllocator arguments. 
Signed-off-by: Babis Chalios --- src/vmm/src/acpi/mod.rs | 50 ++++++++--------- src/vmm/src/arch/aarch64/fdt.rs | 9 +--- src/vmm/src/arch/x86_64/mod.rs | 11 ++-- src/vmm/src/arch/x86_64/mptable.rs | 2 +- src/vmm/src/arch/x86_64/vm.rs | 5 ++ src/vmm/src/builder.rs | 21 ++++---- src/vmm/src/device_manager/legacy.rs | 11 ++-- src/vmm/src/device_manager/mmio.rs | 48 ++++------------- src/vmm/src/device_manager/mod.rs | 54 +++++-------------- src/vmm/src/device_manager/pci_mngr.rs | 29 +++------- src/vmm/src/device_manager/persist.rs | 24 ++------- src/vmm/src/devices/acpi/vmgenid.rs | 2 +- src/vmm/src/devices/pci/pci_segment.rs | 2 +- .../devices/virtio/transport/pci/device.rs | 6 +-- src/vmm/src/lib.rs | 4 +- src/vmm/src/vstate/mod.rs | 2 + .../{device_manager => vstate}/resources.rs | 17 +++--- src/vmm/src/vstate/vm.rs | 20 +++---- 18 files changed, 113 insertions(+), 204 deletions(-) rename src/vmm/src/{device_manager => vstate}/resources.rs (96%) diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index a3e471aed9e..51711d9eb92 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -12,8 +12,8 @@ use crate::acpi::x86_64::{ }; use crate::arch::x86_64::layout; use crate::device_manager::DeviceManager; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; mod x86_64; @@ -80,7 +80,11 @@ impl AcpiTableWriter<'_> { } /// Build the DSDT table for the guest - fn build_dsdt(&mut self, device_manager: &mut DeviceManager) -> Result { + fn build_dsdt( + &mut self, + device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, + ) -> Result { let mut dsdt_data = Vec::new(); // Virtio-devices DSDT data @@ -99,7 +103,7 @@ impl AcpiTableWriter<'_> { setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&device_manager.resource_allocator, &mut dsdt) + 
self.write_acpi_table(resource_allocator, &mut dsdt) } /// Build the FADT table for the guest @@ -193,26 +197,16 @@ impl AcpiTableWriter<'_> { pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; - let dsdt_addr = writer.build_dsdt(device_manager)?; - - let fadt_addr = writer.build_fadt(&device_manager.resource_allocator, dsdt_addr)?; - let madt_addr = writer.build_madt( - &device_manager.resource_allocator, - vcpus.len().try_into().unwrap(), - )?; - let mcfg_addr = writer.build_mcfg( - &device_manager.resource_allocator, - layout::PCI_MMCONFIG_START, - )?; - let xsdt_addr = writer.build_xsdt( - &device_manager.resource_allocator, - fadt_addr, - madt_addr, - mcfg_addr, - )?; + let dsdt_addr = writer.build_dsdt(device_manager, resource_allocator)?; + + let fadt_addr = writer.build_fadt(resource_allocator, dsdt_addr)?; + let madt_addr = writer.build_madt(resource_allocator, vcpus.len().try_into().unwrap())?; + let mcfg_addr = writer.build_mcfg(resource_allocator, layout::PCI_MMCONFIG_START)?; + let xsdt_addr = writer.build_xsdt(resource_allocator, fadt_addr, madt_addr, mcfg_addr)?; writer.build_rsdp(xsdt_addr) } @@ -224,8 +218,8 @@ mod tests { use crate::acpi::{AcpiError, AcpiTableWriter}; use crate::arch::x86_64::layout::{SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; use crate::builder::tests::default_vmm; - use crate::device_manager::resources::ResourceAllocator; use crate::utils::u64_to_usize; + use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::tests::setup_vm_with_memory; struct MockSdt(Vec); @@ -259,14 +253,14 @@ mod tests { // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // 
Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -281,27 +275,27 @@ mod tests { // succeed. let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index a2a4992eb29..0073d7dbc05 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -536,14 +536,7 @@ mod tests { let dummy = Arc::new(Mutex::new(DummyDevice::new())); device_manager .mmio_devices - .register_virtio_test_device( - &vm, - mem.clone(), - &device_manager.resource_allocator, - dummy, - &mut cmdline, - 
"dummy", - ) + .register_virtio_test_device(&vm, mem.clone(), dummy, &mut cmdline, "dummy") .unwrap(); create_fdt( diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 68b903d5ff6..5307dbdf710 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -217,7 +217,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vm.guest_memory(), - &device_manager.resource_allocator, + &vm.common.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -238,7 +238,12 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables(vm.guest_memory(), device_manager, vcpus)?; + create_acpi_tables( + vm.guest_memory(), + device_manager, + &vm.common.resource_allocator, + vcpus, + )?; Ok(()) } @@ -568,9 +573,9 @@ mod tests { use linux_loader::loader::bootparam::boot_e820_entry; use super::*; - use crate::device_manager::resources::ResourceAllocator; use crate::test_utils::{arch_mem, single_region_mem}; use crate::utils::mib_to_bytes; + use crate::vstate::resources::ResourceAllocator; #[test] fn regions_lt_4gb() { diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index c397290c23e..17b2900aeb2 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -15,10 +15,10 @@ use vm_allocator::AllocPolicy; use crate::arch::IRQ_MAX; use crate::arch::x86_64::generated::mpspec; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{ Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, }; +use crate::vstate::resources::ResourceAllocator; // These `mpspec` wrapper types are only data, reading them from data is a safe initialization. 
// SAFETY: POD diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index e84b4338e35..9d22bf9a757 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -11,8 +11,10 @@ use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; use crate::arch::x86_64::msr::MsrError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocatorState; use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] @@ -187,6 +189,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), + resource_allocator: self.common.resource_allocator.save(), pitstate, clock, pic_master, @@ -211,6 +214,8 @@ impl ArchVm { pub struct VmState { /// guest memory state pub memory: GuestMemoryState, + /// resource allocator + pub resource_allocator: ResourceAllocatorState, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index d8d69a97314..4b998fdf138 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -47,6 +47,8 @@ use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; +#[cfg(target_arch = "aarch64")] +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; use crate::vstate::vm::{Vm, VmError}; use crate::{EventManager, Vmm, VmmError}; @@ -188,7 +190,7 @@ pub fn build_microvm_for_boot( .collect::, _>>()?; if vm_resources.pci_enabled { - device_manager.enable_pci()?; + device_manager.enable_pci(&vm)?; } else { boot_cmdline.insert("pci", "off")?; } @@ -197,7 +199,7 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. if vm_resources.boot_timer { - device_manager.attach_boot_timer_device(request_ts)?; + device_manager.attach_boot_timer_device(&vm, request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { @@ -252,7 +254,7 @@ pub fn build_microvm_for_boot( #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&mut device_manager, &mut vcpus)?; + setup_pvtime(&vm.common.resource_allocator, &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } @@ -513,13 +515,12 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; - let addr = device_manager - .resource_allocator + let addr = resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) 
.map_err(StartMicrovmError::AllocateResources)?; Ok(GuestAddress(addr)) @@ -528,12 +529,12 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] fn setup_pvtime( - device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpus: &mut [Vcpu], ) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region let pvtime_mem: GuestAddress = allocate_pvtime_region( - device_manager, + resource_allocator, vcpus.len(), vm_allocator::AllocPolicy::LastMatch, )?; @@ -1141,7 +1142,9 @@ pub(crate) mod tests { let mut vmm = default_vmm(); let request_ts = TimestampUs::default(); - let res = vmm.device_manager.attach_boot_timer_device(request_ts); + let res = vmm + .device_manager + .attach_boot_timer_device(&vmm.vm, request_ts); res.unwrap(); assert!(vmm.device_manager.mmio_devices.boot_timer.is_some()); } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 7011ae71122..47b259ef87b 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -97,11 +97,7 @@ impl PortIODeviceManager { } /// Register supported legacy devices. 
- pub fn register_devices( - &mut self, - io_bus: &vm_device::Bus, - vm: &Vm, - ) -> Result<(), LegacyDeviceError> { + pub fn register_devices(&mut self, vm: &Vm) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, @@ -122,6 +118,8 @@ impl PortIODeviceManager { ), input: None, })); + + let io_bus = &vm.common.resource_allocator.pio_bus; io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -243,7 +241,6 @@ mod tests { #[test] fn test_register_legacy_devices() { let (_, vm) = setup_vm_with_memory(0x1000); - let io_bus = vm_device::Bus::new(); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(SerialDevice { @@ -261,6 +258,6 @@ mod tests { )), ) .unwrap(); - ldm.register_devices(&io_bus, &vm).unwrap(); + ldm.register_devices(&vm).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 2d6cde39c52..da32cf14271 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -19,7 +19,6 @@ use log::debug; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; -use super::resources::ResourceAllocator; use crate::Vm; use crate::arch::BOOT_DEVICE_MEM_START; #[cfg(target_arch = "aarch64")] @@ -31,6 +30,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::mmio::MmioTransport; #[cfg(target_arch = "x86_64")] use crate::vstate::memory::GuestAddress; +use crate::vstate::resources::ResourceAllocator; /// Errors for MMIO device manager. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -180,7 +180,6 @@ impl MMIODeviceManager { &mut self, vm: &Vm, device_id: String, - mmio_bus: &vm_device::Bus, device: MMIODevice, ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. 
@@ -203,7 +202,7 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - mmio_bus.insert( + vm.common.resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -239,13 +238,12 @@ impl MMIODeviceManager { pub fn register_mmio_virtio_for_boot( &mut self, vm: &Vm, - resource_allocator: &ResourceAllocator, device_id: String, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { let device = MMIODevice { - resources: self.allocate_mmio_resources(resource_allocator, 1)?, + resources: self.allocate_mmio_resources(&vm.common.resource_allocator, 1)?, inner: Arc::new(Mutex::new(mmio_device)), }; @@ -261,7 +259,7 @@ impl MMIODeviceManager { device.resources.irq.unwrap().get(), )?; } - self.register_mmio_virtio(vm, device_id, &resource_allocator.mmio_bus, device)?; + self.register_mmio_virtio(vm, device_id, device)?; Ok(()) } @@ -271,7 +269,6 @@ impl MMIODeviceManager { pub fn register_mmio_serial( &mut self, vm: &Vm, - resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -280,7 +277,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = vm.common.resource_allocator.allocate_gsi(1)?; MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, @@ -299,7 +296,7 @@ impl MMIODeviceManager { inner: serial, }; - resource_allocator.mmio_bus.insert( + vm.common.resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -481,20 +478,13 @@ pub(crate) mod tests { &mut self, vm: &Vm, guest_mem: GuestMemoryMmap, - resource_allocator: &ResourceAllocator, device: Arc>, cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); - 
self.register_mmio_virtio_for_boot( - vm, - resource_allocator, - dev_id.to_string(), - mmio_device, - cmdline, - )?; + self.register_mmio_virtio_for_boot(vm, dev_id.to_string(), mmio_device, cmdline)?; Ok(self .get_virtio_device(device.lock().unwrap().device_type(), dev_id) .unwrap() @@ -601,7 +591,6 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -614,7 +603,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, dummy, &mut cmdline, "dummy", @@ -655,7 +643,6 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); #[cfg(target_arch = "x86_64")] @@ -668,7 +655,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy1", @@ -682,7 +668,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy2" @@ -717,21 +702,13 @@ pub(crate) mod tests { vm.setup_irqchip(1).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); let type_id = dummy.lock().unwrap().device_type(); let id = String::from("foo"); let addr = device_manager - .register_virtio_test_device( - &vm, - 
vm.guest_memory().clone(), - &resource_allocator, - dummy, - &mut cmdline, - &id, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy, &mut cmdline, &id) .unwrap(); assert!(device_manager.get_virtio_device(type_id, &id).is_some()); assert_eq!( @@ -755,14 +732,7 @@ pub(crate) mod tests { let dummy2 = Arc::new(Mutex::new(DummyDevice::new())); let id2 = String::from("foo2"); device_manager - .register_virtio_test_device( - &vm, - vm.guest_memory().clone(), - &resource_allocator, - dummy2, - &mut cmdline, - &id2, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy2, &mut cmdline, &id2) .unwrap(); let mut count = 0; diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index d3770b62109..0552ecb1bd3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -18,7 +18,6 @@ use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; -use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; use utils::time::TimestampUs; use vmm_sys_util::eventfd::EventFd; @@ -54,8 +53,6 @@ pub mod mmio; pub mod pci_mngr; /// Device managers (de)serialization support. pub mod persist; -/// Resource manager for devices. 
-pub mod resources; #[derive(Debug, thiserror::Error, displaydoc::Display)] /// Error while creating a new [`DeviceManager`] @@ -93,8 +90,6 @@ pub enum AttachDeviceError { #[derive(Debug)] /// A manager of all peripheral devices of Firecracker pub struct DeviceManager { - /// Allocator for system memory and interrupt numbers - pub resource_allocator: Arc, /// MMIO devices pub mmio_devices: MMIODeviceManager, #[cfg(target_arch = "x86_64")] @@ -139,7 +134,6 @@ impl DeviceManager { event_manager: &mut EventManager, vcpus_exit_evt: &EventFd, vm: &Vm, - resource_allocator: &ResourceAllocator, ) -> Result { Self::set_stdout_nonblocking(); @@ -153,7 +147,7 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; + legacy_devices.register_devices(vm)?; Ok(legacy_devices) } @@ -163,13 +157,10 @@ impl DeviceManager { vcpus_exit_evt: &EventFd, vm: &Vm, ) -> Result { - let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] - let legacy_devices = - Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm, &resource_allocator)?; + let legacy_devices = Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm)?; Ok(DeviceManager { - resource_allocator, mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] legacy_devices, @@ -193,13 +184,8 @@ impl DeviceManager { // The device mutex mustn't be locked here otherwise it will deadlock. 
let device = MmioTransport::new(vm.guest_memory().clone(), interrupt, device, is_vhost_user); - self.mmio_devices.register_mmio_virtio_for_boot( - vm, - &self.resource_allocator, - id, - device, - cmdline, - )?; + self.mmio_devices + .register_mmio_virtio_for_boot(vm, id, device, cmdline)?; Ok(()) } @@ -214,8 +200,7 @@ impl DeviceManager { is_vhost_user: bool, ) -> Result<(), AttachDeviceError> { if self.pci_devices.pci_segment.is_some() { - self.pci_devices - .attach_pci_virtio_device(vm, &self.resource_allocator, id, device)?; + self.pci_devices.attach_pci_virtio_device(vm, id, device)?; } else { self.attach_mmio_virtio_device(vm, id, device, cmdline, is_vhost_user)?; } @@ -226,12 +211,13 @@ impl DeviceManager { /// Attaches a [`BootTimer`] to the VM pub(crate) fn attach_boot_timer_device( &mut self, + vm: &Vm, request_ts: TimestampUs, ) -> Result<(), AttachDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&self.resource_allocator.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&vm.common.resource_allocator.mmio_bus, boot_timer)?; Ok(()) } @@ -241,7 +227,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vm: &Vm, ) -> Result<(), AttachDeviceError> { - let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &vm.common.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } @@ -265,21 +251,19 @@ impl DeviceManager { // Make stdout non-blocking. 
Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; - self.mmio_devices - .register_mmio_serial(vm, &self.resource_allocator, serial, None)?; + self.mmio_devices.register_mmio_serial(vm, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } let rtc = Arc::new(Mutex::new(RTCDevice::new())); self.mmio_devices - .register_mmio_rtc(&self.resource_allocator, rtc, None)?; + .register_mmio_rtc(&vm.common.resource_allocator, rtc, None)?; Ok(()) } /// Enables PCIe support for Firecracker devices - pub fn enable_pci(&mut self) -> Result<(), PciManagerError> { - self.pci_devices - .attach_pci_segment(&self.resource_allocator) + pub fn enable_pci(&mut self, vm: &Arc) -> Result<(), PciManagerError> { + self.pci_devices.attach_pci_segment(vm) } fn do_kick_device(virtio_device: Arc>) { @@ -397,8 +381,6 @@ impl DeviceManager { #[derive(Debug, Default, Clone, Serialize, Deserialize)] /// State of devices in the system pub struct DevicesState { - /// Resource allocator state - pub resource_allocator_state: resources::ResourceAllocatorState, /// MMIO devices state pub mmio_state: persist::DeviceStates, /// ACPI devices state @@ -454,7 +436,6 @@ impl<'a> Persist<'a> for DeviceManager { fn save(&self) -> Self::State { DevicesState { - resource_allocator_state: self.resource_allocator.save(), mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), pci_state: self.pci_devices.save(), @@ -465,17 +446,12 @@ impl<'a> Persist<'a> for DeviceManager { constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - // Safe to unwrap here. ResourceAllocator restoring cannot fail. 
- let resource_allocator = - Arc::new(ResourceAllocator::restore((), &state.resource_allocator_state).unwrap()); - // Setup legacy devices in case of x86 #[cfg(target_arch = "x86_64")] let legacy_devices = Self::create_legacy_devices( constructor_args.event_manager, constructor_args.vcpus_exit_evt, constructor_args.vm, - &resource_allocator, )?; // Restore MMIO devices @@ -483,7 +459,6 @@ impl<'a> Persist<'a> for DeviceManager { mem: constructor_args.mem, vm: constructor_args.vm, event_manager: constructor_args.event_manager, - resource_allocator: &resource_allocator, vm_resources: constructor_args.vm_resources, instance_id: constructor_args.instance_id, restored_from_file: constructor_args.restored_from_file, @@ -493,7 +468,6 @@ impl<'a> Persist<'a> for DeviceManager { // Restore ACPI devices let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { mem: constructor_args.mem, - resource_allocator: &resource_allocator, vm: constructor_args.vm, }; let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; @@ -501,7 +475,6 @@ impl<'a> Persist<'a> for DeviceManager { // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { - resource_allocator: &resource_allocator, vm: constructor_args.vm.clone(), mem: constructor_args.mem, vm_resources: constructor_args.vm_resources, @@ -512,7 +485,6 @@ impl<'a> Persist<'a> for DeviceManager { let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; let device_manager = DeviceManager { - resource_allocator, mmio_devices, #[cfg(target_arch = "x86_64")] legacy_devices, @@ -578,7 +550,6 @@ pub(crate) mod tests { let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); let pci_devices = PciDevices::new(); - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] let legacy_devices = PortIODeviceManager::new( @@ -592,7 +563,6 @@ pub(crate) mod tests { .unwrap(); DeviceManager { - resource_allocator, 
mmio_devices, #[cfg(target_arch = "x86_64")] legacy_devices, diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 26a44dd29c9..199c6ec3c7c 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use vm_device::BusError; use super::persist::{MmdsVersionState, SharedDeviceType}; -use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; @@ -34,6 +33,7 @@ use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; use crate::{EventManager, Vm}; @@ -68,17 +68,14 @@ impl PciDevices { Default::default() } - pub fn attach_pci_segment( - &mut self, - resource_allocator: &Arc, - ) -> Result<(), PciManagerError> { + pub fn attach_pci_segment(&mut self, vm: &Arc) -> Result<(), PciManagerError> { // We only support a single PCIe segment. Calling this function twice is a Firecracker // internal error. assert!(self.pci_segment.is_none()); // Currently we don't assign any IRQs to PCI devices. We will be using MSI-X interrupts // only. 
- let pci_segment = PciSegment::new(0, resource_allocator, &[0u8; 32])?; + let pci_segment = PciSegment::new(0, &vm.common.resource_allocator, &[0u8; 32])?; self.pci_segment = Some(pci_segment); Ok(()) @@ -128,7 +125,6 @@ impl PciDevices { >( &mut self, vm: &Arc, - resource_allocator: &ResourceAllocator, id: String, device: Arc>, ) -> Result<(), PciManagerError> { @@ -137,17 +133,14 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); + let resource_allocator = &vm.common.resource_allocator; let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration let msix_num = u16::try_from(device.lock().expect("Poisoned lock").queues().len() + 1).unwrap(); - let msix_vectors = Arc::new(Vm::create_msix_group( - vm.clone(), - resource_allocator, - msix_num, - )?); + let msix_vectors = Arc::new(Vm::create_msix_group(vm.clone(), msix_num)?); // Create the transport let mut virtio_device = @@ -187,7 +180,6 @@ impl PciDevices { fn restore_pci_device( &mut self, vm: &Arc, - resource_allocator: &ResourceAllocator, device: Arc>, device_id: &str, transport_state: &VirtioPciDeviceState, @@ -221,7 +213,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, device_id.to_string()), virtio_device.clone()); - Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(&vm.common.resource_allocator, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") @@ -276,7 +268,6 @@ pub struct PciDevicesState { pub struct PciDevicesConstructorArgs<'a> { pub vm: Arc, pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a Arc, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -288,7 +279,6 @@ impl<'a> Debug for PciDevicesConstructorArgs<'a> { f.debug_struct("PciDevicesConstructorArgs") .field("vm", 
&self.vm) .field("mem", &self.mem) - .field("resource_allocator", &self.resource_allocator) .field("vm_resources", &self.vm_resources) .field("instance_id", &self.instance_id) .field("restored_from_file", &self.restored_from_file) @@ -437,7 +427,7 @@ impl<'a> Persist<'a> for PciDevices { return Ok(pci_devices); } - pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + pci_devices.attach_pci_segment(&constructor_args.vm)?; if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new( @@ -459,7 +449,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &balloon_state.device_id, &balloon_state.transport_state, @@ -485,7 +474,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &block_state.device_id, &block_state.transport_state, @@ -536,7 +524,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &net_state.device_id, &net_state.transport_state, @@ -570,7 +557,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &vsock_state.device_id, &vsock_state.transport_state, @@ -594,7 +580,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &entropy_state.device_id, &entropy_state.transport_state, diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 93385805e7b..24a0d3cca3d 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use super::acpi::ACPIDeviceManager; use super::mmio::*; -use super::resources::ResourceAllocator; #[cfg(target_arch = 
"aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; @@ -165,7 +164,6 @@ pub struct MMIODevManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub vm: &'a Vm, pub event_manager: &'a mut EventManager, - pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -191,7 +189,6 @@ pub struct ACPIDeviceManagerState { #[derive(Debug)] pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a ResourceAllocator, pub vm: &'a Vm, } @@ -223,7 +220,7 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let vmgenid = VmGenId::restore( VMGenIdConstructorArgs { mem: constructor_args.mem, - resource_allocator: constructor_args.resource_allocator, + resource_allocator: &constructor_args.vm.common.resource_allocator, }, vmgenid_args, )?; @@ -387,17 +384,12 @@ impl<'a> Persist<'a> for MMIODeviceManager { .event_manager .add_subscriber(serial.clone()); - dev_manager.register_mmio_serial( - vm, - constructor_args.resource_allocator, - serial, - Some(state.device_info), - )?; + dev_manager.register_mmio_serial(vm, serial, Some(state.device_info))?; } if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - constructor_args.resource_allocator, + &constructor_args.vm.common.resource_allocator, rtc, Some(state.device_info), )?; @@ -412,7 +404,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { id: &String, state: &MmioTransportState, device_info: &MMIODeviceInfo, - mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { let interrupt = Arc::new(IrqTrigger::new()); @@ -430,7 +421,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_virtio( vm, id.clone(), - mmio_bus, MMIODevice { resources: *device_info, inner: mmio_transport, @@ -469,7 +459,6 @@ 
impl<'a> Persist<'a> for MMIODeviceManager { &balloon_state.device_id, &balloon_state.transport_state, &balloon_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -492,7 +481,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &block_state.device_id, &block_state.transport_state, &block_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -539,7 +527,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &net_state.device_id, &net_state.transport_state, &net_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -569,7 +556,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &vsock_state.device_id, &vsock_state.transport_state, &vsock_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -594,7 +580,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &entropy_state.device_id, &entropy_state.transport_state, &entropy_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -725,14 +710,11 @@ mod tests { let vmm = default_vmm(); let device_manager_state: device_manager::DevicesState = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); - let resource_allocator = - ResourceAllocator::restore((), &device_manager_state.resource_allocator_state).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), vm: &vmm.vm, event_manager: &mut event_manager, - resource_allocator: &resource_allocator, vm_resources, instance_id: "microvm-id", restored_from_file: true, diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 0cf0ae0d7b1..5c8d4ecbc51 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -11,9 +11,9 @@ use vm_superio::Trigger; use 
vmm_sys_util::eventfd::EventFd; use super::super::legacy::EventFdTrigger; -use crate::device_manager::resources::ResourceAllocator; use crate::snapshot::Persist; use crate::vstate::memory::{Bytes, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; /// Bytes of memory we allocate for VMGenID device pub const VMGENID_MEM_SIZE: u64 = 16; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index c1e8bb07cb8..e957332bb0e 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -22,7 +22,7 @@ use vm_allocator::AddressAllocator; use vm_device::{BusDeviceSync, BusError}; use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; -use crate::device_manager::resources::ResourceAllocator; +use crate::vstate::resources::ResourceAllocator; pub struct PciSegment { pub(crate) id: u16, diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 6793d502f00..384ad0358dd 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -33,7 +33,6 @@ use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; use crate::Vm; -use crate::device_manager::resources::ResourceAllocator; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::transport::pci::common_config::{ @@ -45,6 +44,7 @@ use crate::logger::{debug, error}; use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; const DEVICE_INIT: u8 = 0x00; @@ -1153,7 +1153,7 @@ mod tests { #[test] fn test_pci_device_config() { let mut vmm = default_vmm(); - vmm.device_manager.enable_pci(); + vmm.device_manager.enable_pci(&vmm.vm); let entropy = 
Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); vmm.device_manager .attach_virtio_device( @@ -1271,7 +1271,7 @@ mod tests { #[test] fn test_reading_bars() { let mut vmm = default_vmm(); - vmm.device_manager.enable_pci(); + vmm.device_manager.enable_pci(&vmm.vm); let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); vmm.device_manager .attach_virtio_device( diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index d4cb5a78344..d65fcdbeed5 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -373,10 +373,10 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.device_manager.resource_allocator.mmio_bus.clone()); + vcpu.set_mmio_bus(self.vm.common.resource_allocator.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] vcpu.kvm_vcpu - .set_pio_bus(self.device_manager.resource_allocator.pio_bus.clone()); + .set_pio_bus(self.vm.common.resource_allocator.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); diff --git a/src/vmm/src/vstate/mod.rs b/src/vmm/src/vstate/mod.rs index 47458835e04..f4fa25914d0 100644 --- a/src/vmm/src/vstate/mod.rs +++ b/src/vmm/src/vstate/mod.rs @@ -5,6 +5,8 @@ pub mod kvm; /// Module with GuestMemory implementation. pub mod memory; +/// Resource manager for devices. +pub mod resources; /// Module with Vcpu implementation. pub mod vcpu; /// Module with Vm implementation. 
diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/vstate/resources.rs similarity index 96% rename from src/vmm/src/device_manager/resources.rs rename to src/vmm/src/vstate/resources.rs index f7035e55566..3b77b892bc3 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -22,13 +22,13 @@ use crate::snapshot::Persist; /// * Memory allocations in the MMIO address space #[derive(Debug)] pub struct ResourceAllocator { - // Allocator for device interrupt lines + /// Allocator for device interrupt lines pub gsi_allocator: Arc>, - // Allocator for memory in the 32-bit MMIO address space + /// Allocator for memory in the 32-bit MMIO address space pub mmio32_memory: Arc>, - // Allocator for memory in the 64-bit MMIO address space + /// Allocator for memory in the 64-bit MMIO address space pub mmio64_memory: Arc>, - // Memory allocator for system data + /// Memory allocator for system data pub system_memory: Arc>, /// MMIO bus pub mmio_bus: Arc, @@ -186,14 +186,15 @@ impl<'a> Persist<'a> for ResourceAllocator { } #[derive(Debug, Clone, Serialize, Deserialize)] +/// State of a ResourceAllocator pub struct ResourceAllocatorState { - // Allocator for device interrupt lines + /// Allocator for device interrupt lines pub gsi_allocator: Arc>, - // Allocator for memory in the 32-bit MMIO address space + /// Allocator for memory in the 32-bit MMIO address space pub mmio32_memory: Arc>, - // Allocator for memory in the 64-bit MMIO address space + /// Allocator for memory in the 64-bit MMIO address space pub mmio64_memory: Arc>, - // Memory allocator for system data + /// Memory allocator for system data pub system_memory: Arc>, } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 950bcac652d..a2c3a65be6b 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -28,7 +28,6 @@ use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; 
-use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::snapshot::Persist; @@ -37,6 +36,7 @@ use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, }; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; @@ -244,6 +244,8 @@ pub struct VmCommon { pub guest_memory: GuestMemoryMmap, /// Interrupts used by Vm's devices pub interrupts: Mutex>, + /// Allocator for VM resources + pub resource_allocator: Arc, } /// Errors associated with the wrappers over KVM ioctls. @@ -265,6 +267,8 @@ pub enum VmError { NotEnoughMemorySlots, /// Memory Error: {0} VmMemory(#[from] vm_memory::Error), + /// ResourceAllocator error: {0} + ResourceAllocator(#[from] vm_allocator::Error) } /// Contains Vm functions that are usable across CPU architectures @@ -312,6 +316,7 @@ impl Vm { max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), + resource_allocator: Arc::new(ResourceAllocator::new()?), }) } @@ -565,14 +570,12 @@ impl Vm { } /// Create a group of MSI-X interrupts - pub fn create_msix_group( - vm: Arc, - resource_allocator: &ResourceAllocator, - count: u16, - ) -> Result { + pub fn create_msix_group(vm: Arc, count: u16) -> Result { debug!("Creating new MSI group with {count} vectors"); let mut irq_routes = HashMap::with_capacity(count as usize); - for (gsi, i) in resource_allocator + for (gsi, i) in vm + .common + .resource_allocator .allocate_gsi(count as u32)? .iter() .zip(0u32..) 
@@ -723,8 +726,7 @@ pub(crate) mod tests { } fn create_msix_group(vm: &Arc) -> MsiVectorGroup { - let resource_allocator = ResourceAllocator::new().unwrap(); - Vm::create_msix_group(vm.clone(), &resource_allocator, 4).unwrap() + Vm::create_msix_group(vm.clone(), 4).unwrap() } #[test] From be094128e7b576483c54b676ae0b3625906b3558 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 13 Jun 2025 17:09:46 +0200 Subject: [PATCH 20/27] refactor(vm): move `Bus` objects to Vm We had previously added MMIO and Port IO buses inside ResourceAllocator so that we could implement DeviceRelocation for the type. Now, we will delegate device relocation responsibilities to ArchVm instead. That is because device relocation requires access to the Vm file descriptor as well. As a result, we can move buses to the Vm object itself. Add MMIO bus to VmCommon as both architectures use it. Add PortIO bus for x86 architecture only. Not that we don't still support DeviceRelocation. VirtIO devices should not request us to relocate them. Also, for adding such support we would need to also support VirtIO reset. We will look into adding this functionaliyt later on. 
Signed-off-by: Babis Chalios --- src/vmm/src/arch/x86_64/vm.rs | 6 +++ src/vmm/src/device_manager/legacy.rs | 2 +- src/vmm/src/device_manager/mmio.rs | 10 ++--- src/vmm/src/device_manager/mod.rs | 5 +-- src/vmm/src/device_manager/pci_mngr.rs | 24 ++++------ src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/pci/pci_segment.rs | 61 ++++++++++++-------------- src/vmm/src/lib.rs | 5 +-- src/vmm/src/vstate/resources.rs | 26 ----------- src/vmm/src/vstate/vm.rs | 20 ++++++++- 10 files changed, 73 insertions(+), 88 deletions(-) diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index 9d22bf9a757..fbc27c82a60 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; +use std::sync::Arc; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, @@ -58,6 +59,8 @@ pub struct ArchVm { /// /// `None` if `KVM_CAP_XSAVE2` not supported. 
xsave2_size: Option, + /// Port IO bus + pub pio_bus: Arc, } impl ArchVm { @@ -92,10 +95,13 @@ impl ArchVm { .set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS)) .map_err(ArchVmError::SetTssAddress)?; + let pio_bus = Arc::new(vm_device::Bus::new()); + Ok(ArchVm { common, msrs_to_save, xsave2_size, + pio_bus, }) } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 47b259ef87b..d0194e24e62 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -119,7 +119,7 @@ impl PortIODeviceManager { input: None, })); - let io_bus = &vm.common.resource_allocator.pio_bus; + let io_bus = &vm.pio_bus; io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index da32cf14271..13ab13f47ea 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -202,7 +202,7 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - vm.common.resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -296,7 +296,7 @@ impl MMIODeviceManager { inner: serial, }; - vm.common.resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -327,7 +327,7 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. 
pub fn register_mmio_rtc( &mut self, - resource_allocator: &ResourceAllocator, + vm: &Vm, rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -336,7 +336,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = vm.common.resource_allocator.allocate_gsi(1)?; MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, @@ -349,7 +349,7 @@ impl MMIODeviceManager { inner: rtc, }; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 0552ecb1bd3..0b610ccf454 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -217,7 +217,7 @@ impl DeviceManager { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&vm.common.resource_allocator.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&vm.common.mmio_bus, boot_timer)?; Ok(()) } @@ -256,8 +256,7 @@ impl DeviceManager { } let rtc = Arc::new(Mutex::new(RTCDevice::new())); - self.mmio_devices - .register_mmio_rtc(&vm.common.resource_allocator, rtc, None)?; + self.mmio_devices.register_mmio_rtc(vm, rtc, None)?; Ok(()) } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 199c6ec3c7c..303a34a3448 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -33,7 +33,6 @@ use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; -use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; use crate::{EventManager, Vm}; @@ -75,14 +74,14 @@ impl PciDevices { // Currently we don't assign any IRQs 
to PCI devices. We will be using MSI-X interrupts // only. - let pci_segment = PciSegment::new(0, &vm.common.resource_allocator, &[0u8; 32])?; + let pci_segment = PciSegment::new(0, vm, &[0u8; 32])?; self.pci_segment = Some(pci_segment); Ok(()) } fn register_bars_with_bus( - resource_allocator: &ResourceAllocator, + vm: &Vm, virtio_device: &Arc>, ) -> Result<(), PciManagerError> { for bar in &virtio_device.lock().expect("Poisoned lock").bar_regions { @@ -94,11 +93,8 @@ impl PciDevices { bar.size() ); #[cfg(target_arch = "x86_64")] - resource_allocator.pio_bus.insert( - virtio_device.clone(), - bar.addr(), - bar.size(), - )?; + vm.pio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; #[cfg(target_arch = "aarch64")] log::error!("pci: We do not support I/O region allocation") } @@ -108,11 +104,9 @@ impl PciDevices { bar.addr(), bar.size() ); - resource_allocator.mmio_bus.insert( - virtio_device.clone(), - bar.addr(), - bar.size(), - )?; + vm.common + .mmio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; } } } @@ -168,7 +162,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, id.clone()), virtio_device.clone()); - Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(vm, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") @@ -213,7 +207,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, device_id.to_string()), virtio_device.clone()); - Self::register_bars_with_bus(&vm.common.resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(vm, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 24a0d3cca3d..87358181df9 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -389,7 +389,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { if state.type_ == DeviceType::Rtc { let rtc = 
Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - &constructor_args.vm.common.resource_allocator, + constructor_args.vm, rtc, Some(state.device_info), )?; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index e957332bb0e..c37763eab3a 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -21,7 +21,7 @@ use uuid::Uuid; use vm_allocator::AddressAllocator; use vm_device::{BusDeviceSync, BusError}; -use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; +use crate::arch::{ArchVm as Vm, PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; use crate::vstate::resources::ResourceAllocator; pub struct PciSegment { @@ -67,28 +67,21 @@ impl std::fmt::Debug for PciSegment { } impl PciSegment { - fn build( - id: u16, - resource_allocator: &Arc, - pci_irq_slots: &[u8; 32], - ) -> Result { + fn build(id: u16, vm: &Arc, pci_irq_slots: &[u8; 32]) -> Result { let pci_root = PciRoot::new(None); - let pci_bus = Arc::new(Mutex::new(PciBus::new( - pci_root, - resource_allocator.clone(), - ))); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, vm.clone()))); let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); let mmio_config_address = PCI_MMCONFIG_START + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( Arc::clone(&pci_config_mmio) as Arc, mmio_config_address, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, )?; - let mem32_allocator = resource_allocator.mmio32_memory.clone(); - let mem64_allocator = resource_allocator.mmio64_memory.clone(); + let mem32_allocator = vm.common.resource_allocator.mmio32_memory.clone(); + let mem64_allocator = vm.common.resource_allocator.mmio64_memory.clone(); let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); @@ -119,13 +112,15 @@ impl PciSegment { 
#[cfg(target_arch = "x86_64")] pub(crate) fn new( id: u16, - resource_allocator: &Arc, + vm: &Arc, pci_irq_slots: &[u8; 32], ) -> Result { - let mut segment = Self::build(id, resource_allocator, pci_irq_slots)?; + use crate::Vm; + + let mut segment = Self::build(id, vm, pci_irq_slots)?; let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus)))); - resource_allocator.pio_bus.insert( + vm.pio_bus.insert( pci_config_io.clone(), PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, @@ -151,10 +146,10 @@ impl PciSegment { #[cfg(target_arch = "aarch64")] pub(crate) fn new( id: u16, - resource_allocator: &Arc, + vm: &Arc, pci_irq_slots: &[u8; 32], ) -> Result { - let segment = Self::build(id, resource_allocator, pci_irq_slots)?; + let segment = Self::build(id, vm, pci_irq_slots)?; info!( "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}]", @@ -468,13 +463,14 @@ mod tests { use super::*; use crate::arch; + use crate::builder::tests::default_vmm; use crate::utils::u64_to_usize; #[test] fn test_pci_segment_build() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); assert_eq!(pci_segment.id, 0); assert_eq!( @@ -503,17 +499,14 @@ mod tests { #[cfg(target_arch = "x86_64")] #[test] fn test_io_bus() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); let mut data = [0u8; u64_to_usize(PCI_CONFIG_IO_PORT_SIZE)]; - resource_allocator - .pio_bus - .read(PCI_CONFIG_IO_PORT, &mut data) - .unwrap(); + 
vmm.vm.pio_bus.read(PCI_CONFIG_IO_PORT, &mut data).unwrap(); - resource_allocator + vmm.vm .pio_bus .read(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE, &mut data) .unwrap_err(); @@ -521,17 +514,19 @@ mod tests { #[test] fn test_mmio_bus() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); let mut data = [0u8; u64_to_usize(PCI_MMIO_CONFIG_SIZE_PER_SEGMENT)]; - resource_allocator + vmm.vm + .common .mmio_bus .read(pci_segment.mmio_config_address, &mut data) .unwrap(); - resource_allocator + vmm.vm + .common .mmio_bus .read( pci_segment.mmio_config_address + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, @@ -542,9 +537,9 @@ mod tests { #[test] fn test_next_device_bdf() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); // Start checking from device id 1, since 0 is allocated to the Root port. for dev_id in 1..32 { diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index d65fcdbeed5..4549c79857a 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -373,10 +373,9 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) 
{ - vcpu.set_mmio_bus(self.vm.common.resource_allocator.mmio_bus.clone()); + vcpu.set_mmio_bus(self.vm.common.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] - vcpu.kvm_vcpu - .set_pio_bus(self.vm.common.resource_allocator.pio_bus.clone()); + vcpu.kvm_vcpu.set_pio_bus(self.vm.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs index 3b77b892bc3..3d8d8016e97 100644 --- a/src/vmm/src/vstate/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -4,11 +4,9 @@ use std::convert::Infallible; use std::sync::{Arc, Mutex}; -use pci::DeviceRelocation; use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; -use vm_device::Bus; use crate::arch; use crate::snapshot::Persist; @@ -30,11 +28,6 @@ pub struct ResourceAllocator { pub mmio64_memory: Arc>, /// Memory allocator for system data pub system_memory: Arc>, - /// MMIO bus - pub mmio_bus: Arc, - #[cfg(target_arch = "x86_64")] - /// Port IO bus - pub pio_bus: Arc, } impl ResourceAllocator { @@ -54,9 +47,6 @@ impl ResourceAllocator { arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE, )?)), - mmio_bus: Arc::new(Bus::new()), - #[cfg(target_arch = "x86_64")] - pio_bus: Arc::new(Bus::new()), }) } @@ -178,9 +168,6 @@ impl<'a> Persist<'a> for ResourceAllocator { mmio32_memory: state.mmio32_memory.clone(), mmio64_memory: state.mmio64_memory.clone(), system_memory: state.system_memory.clone(), - mmio_bus: Arc::new(Bus::new()), - #[cfg(target_arch = "x86_64")] - pio_bus: Arc::new(Bus::new()), }) } } @@ -219,19 +206,6 @@ impl Default for ResourceAllocatorState { } } -impl DeviceRelocation for ResourceAllocator { - fn move_bar( - &self, - _old_base: u64, - _new_base: u64, - _len: u64, - _pci_dev: &mut dyn pci::PciDevice, - _region_type: pci::PciBarRegionType, - ) -> Result<(), std::io::Error> { - todo!() - } -} - #[cfg(test)] mod tests { 
use vm_allocator::AllocPolicy; diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index a2c3a65be6b..6bdfad5e37b 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -19,7 +19,8 @@ use kvm_bindings::{ KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, }; use kvm_ioctls::VmFd; -use log::debug; +use log::{debug, error}; +use pci::DeviceRelocation; use serde::{Deserialize, Serialize}; use vm_device::interrupt::{ InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, @@ -246,6 +247,8 @@ pub struct VmCommon { pub interrupts: Mutex>, /// Allocator for VM resources pub resource_allocator: Arc, + /// MMIO bus + pub mmio_bus: Arc, } /// Errors associated with the wrappers over KVM ioctls. @@ -317,6 +320,7 @@ impl Vm { guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), resource_allocator: Arc::new(ResourceAllocator::new()?), + mmio_bus: Arc::new(vm_device::Bus::new()), }) } @@ -603,6 +607,20 @@ impl Vm { } } +impl DeviceRelocation for Vm { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + error!("pci: device relocation not supported"); + Err(std::io::Error::from(std::io::ErrorKind::Unsupported)) + } +} + #[cfg(test)] pub(crate) mod tests { use vm_device::interrupt::{InterruptSourceConfig, LegacyIrqSourceConfig}; From d148cc5d5ccd2236310016ec5f2194bcf3601e32 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 18 Jun 2025 23:39:14 +0200 Subject: [PATCH 21/27] arm: support MSI-X on ARM Add support for ITS device which provides support for MSI interrupts on ARM architecture. This is currently supported only on systems with GICv3 interrupt controller. In order to make saving/restore of ITS state work properly, we need to change the order in which we restore redistributor register GICR_CTLR. 
We need to make sure that this register is restored last. Otherwise, restoring GICR_PROPBASER doesn't have any effect and ITS depends on it in order to save/restore ITS tables to/from guest memory. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 30 ++++ src/vmm/src/arch/aarch64/gic/gicv2/mod.rs | 4 +- .../src/arch/aarch64/gic/gicv2/regs/mod.rs | 1 + src/vmm/src/arch/aarch64/gic/gicv3/mod.rs | 88 +++++++++--- .../arch/aarch64/gic/gicv3/regs/its_regs.rs | 135 ++++++++++++++++++ .../src/arch/aarch64/gic/gicv3/regs/mod.rs | 48 +++++-- .../aarch64/gic/gicv3/regs/redist_regs.rs | 2 +- src/vmm/src/arch/aarch64/gic/mod.rs | 22 +++ src/vmm/src/arch/aarch64/gic/regs.rs | 3 + src/vmm/src/arch/aarch64/output_GICv3.dtb | Bin 2097152 -> 2097152 bytes .../src/arch/aarch64/output_initrd_GICv3.dtb | Bin 2097152 -> 2097152 bytes 11 files changed, 298 insertions(+), 35 deletions(-) create mode 100644 src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 0073d7dbc05..a4cf14b52d7 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -28,6 +28,8 @@ use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; const GIC_PHANDLE: u32 = 1; // This is a value for uniquely identifying the FDT node containing the clock definition. const CLOCK_PHANDLE: u32 = 2; +// This is a value for uniquely identifying the FDT node declaring the MSI controller. +const MSI_PHANDLE: u32 = 3; // You may be wondering why this big value? // This phandle is used to uniquely identify the FDT nodes containing cache information. Each cpu // can have a variable number of caches, some of these caches may be shared with other cpus. 
@@ -302,6 +304,16 @@ fn create_gic_node(fdt: &mut FdtWriter, gic_device: &GICDevice) -> Result<(), Fd ]; fdt.property_array_u32("interrupts", &gic_intr)?; + + if let Some(msi_properties) = gic_device.msi_properties() { + let msic_node = fdt.begin_node("msic")?; + fdt.property_string("compatible", "arm,gic-v3-its")?; + fdt.property_null("msi-controller")?; + fdt.property_u32("phandle", MSI_PHANDLE)?; + fdt.property_array_u64("reg", msi_properties)?; + fdt.end_node(msic_node)?; + } + fdt.end_node(interrupt)?; Ok(()) @@ -471,6 +483,21 @@ fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), (MEM_64BIT_DEVICES_SIZE >> 32) as u32, // Range size ((MEM_64BIT_DEVICES_SIZE & 0xffff_ffff) >> 32) as u32, ]; + + // See kernel document Documentation/devicetree/bindings/pci/pci-msi.txt + let msi_map = [ + // rid-base: A single cell describing the first RID matched by the entry. + 0x0, + // msi-controller: A single phandle to an MSI controller. + MSI_PHANDLE, + // msi-base: An msi-specifier describing the msi-specifier produced for the + // first RID matched by the entry. + segment.id as u32, + // length: A single cell describing how many consecutive RIDs are matched + // following the rid-base. + 0x100, + ]; + let pci_node = fdt.begin_node(&pci_node_name)?; fdt.property_string("compatible", "pci-host-ecam-generic")?; @@ -491,6 +518,9 @@ fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), fdt.property_null("interrupt-map")?; fdt.property_null("interrupt-map-mask")?; fdt.property_null("dma-coherent")?; + fdt.property_array_u32("msi-map", &msi_map)?; + fdt.property_u32("msi-parent", MSI_PHANDLE)?; + Ok(fdt.end_node(pci_node)?) 
} diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index c4b9208a0a6..dfa2302d6be 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -68,7 +68,9 @@ impl GICv2 { GICv2::get_cpu_addr(), GICv2::get_cpu_size(), ], + msi_properties: None, vcpu_count, + its_device: None, }) } @@ -82,7 +84,7 @@ impl GICv2 { pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. - // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs index 8bb26ce2bcd..2b617716fe2 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs @@ -22,6 +22,7 @@ pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { Ok(GicState { dist: dist_regs::get_dist_regs(fd)?, gic_vcpu_states: vcpu_states, + ..Default::default() }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 39c4e5ce148..075687bc23e 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -1,7 +1,7 @@ // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -mod regs; +pub mod regs; use kvm_ioctls::{DeviceFd, VmFd}; @@ -18,12 +18,19 @@ impl std::ops::Deref for GICv3 { } } +impl std::ops::DerefMut for GICv3 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl GICv3 { // Unfortunately bindgen omits defines that are based on other defines. // See arch/arm64/include/uapi/asm/kvm.h file from the linux kernel. 
const SZ_64K: u64 = 0x0001_0000; const KVM_VGIC_V3_DIST_SIZE: u64 = GICv3::SZ_64K; const KVM_VGIC_V3_REDIST_SIZE: u64 = (2 * GICv3::SZ_64K); + const GIC_V3_ITS_SIZE: u64 = 0x2_0000; // Device trees specific constants const ARCH_GIC_V3_MAINT_IRQ: u32 = 9; @@ -48,6 +55,16 @@ impl GICv3 { vcpu_count * GICv3::KVM_VGIC_V3_REDIST_SIZE } + /// Get the MSI address + fn get_msi_address(vcpu_count: u64) -> u64 { + Self::get_redists_addr(vcpu_count) - GICv3::GIC_V3_ITS_SIZE + } + + /// Get the MSI size + const fn get_msi_size() -> u64 { + GICv3::GIC_V3_ITS_SIZE + } + pub const VERSION: u32 = kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3; pub fn fdt_compatibility(&self) -> &str { @@ -59,30 +76,43 @@ impl GICv3 { } /// Create the GIC device object - pub fn create_device(fd: DeviceFd, vcpu_count: u64) -> Self { - GICv3(super::GIC { - fd, + pub fn create_device(vm: &VmFd, vcpu_count: u64) -> Result { + // Create the GIC device + let mut gic_device = kvm_bindings::kvm_create_device { + type_: Self::VERSION, + fd: 0, + flags: 0, + }; + + let gic_fd = vm + .create_device(&mut gic_device) + .map_err(GicError::CreateGIC)?; + + Ok(GICv3(super::GIC { + fd: gic_fd, properties: [ GICv3::get_dist_addr(), GICv3::get_dist_size(), GICv3::get_redists_addr(vcpu_count), GICv3::get_redists_size(vcpu_count), ], + msi_properties: Some([GICv3::get_msi_address(vcpu_count), GICv3::get_msi_size()]), vcpu_count, - }) + its_device: None, + })) } pub fn save_device(&self, mpidrs: &[u64]) -> Result { - regs::save_state(&self.fd, mpidrs) + regs::save_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs) } pub fn restore_device(&self, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - regs::restore_state(&self.fd, mpidrs, state) + regs::restore_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs, state) } pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. 
- // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -104,25 +134,45 @@ impl GICv3 { Ok(()) } - /// Initialize a GIC device - pub fn init_device(vm: &VmFd) -> Result { - let mut gic_device = kvm_bindings::kvm_create_device { - type_: Self::VERSION, + fn init_its(vm: &VmFd, gic_device: &mut Self) -> Result<(), GicError> { + // ITS part attributes + let mut its_device = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS, fd: 0, flags: 0, }; - vm.create_device(&mut gic_device) - .map_err(GicError::CreateGIC) + let its_fd = vm + .create_device(&mut its_device) + .map_err(GicError::CreateGIC)?; + + // Setting up the ITS attributes + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_ITS_ADDR_TYPE), + &Self::get_msi_address(gic_device.vcpu_count()) as *const u64 as u64, + 0, + )?; + + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_CTRL_INIT), + 0, + 0, + )?; + + gic_device.its_device = Some(its_fd); + Ok(()) } /// Method to initialize the GIC device pub fn create(vm: &VmFd, vcpu_count: u64) -> Result { - let vgic_fd = Self::init_device(vm)?; - - let device = Self::create_device(vgic_fd, vcpu_count); + let mut device = Self::create_device(vm, vcpu_count)?; Self::init_device_attributes(&device)?; + Self::init_its(vm, &mut device)?; Self::finalize_device(&device)?; @@ -184,14 +234,14 @@ impl GICv3 { /// RDIST pending tables into guest RAM. /// /// The tables get flushed to guest RAM whenever the VM gets stopped. 
-fn save_pending_tables(fd: &DeviceFd) -> Result<(), GicError> { +fn save_pending_tables(gic_device: &DeviceFd) -> Result<(), GicError> { let init_gic_attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, attr: u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES), addr: 0, flags: 0, }; - fd.set_device_attr(&init_gic_attr).map_err(|err| { + gic_device.set_device_attr(&init_gic_attr).map_err(|err| { GicError::DeviceAttribute(err, true, kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL) }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs new file mode 100644 index 00000000000..ee4ecafba1e --- /dev/null +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs @@ -0,0 +1,135 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{ + KVM_DEV_ARM_ITS_RESTORE_TABLES, KVM_DEV_ARM_ITS_SAVE_TABLES, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_GRP_ITS_REGS, +}; +use kvm_ioctls::DeviceFd; +use serde::{Deserialize, Serialize}; + +use crate::arch::aarch64::gic::GicError; + +// ITS registers that we want to preserve across snapshots +const GITS_CTLR: u32 = 0x0000; +const GITS_IIDR: u32 = 0x0004; +const GITS_CBASER: u32 = 0x0080; +const GITS_CWRITER: u32 = 0x0088; +const GITS_CREADR: u32 = 0x0090; +const GITS_BASER: u32 = 0x0100; + +fn set_device_attribute( + its_device: &DeviceFd, + group: u32, + attr: u32, + val: u64, +) -> Result<(), GicError> { + let gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + addr: &val as *const u64 as u64, + flags: 0, + }; + + its_device + .set_device_attr(&gicv3_its_attr) + .map_err(|err| GicError::DeviceAttribute(err, true, group)) +} + +fn get_device_attribute(its_device: &DeviceFd, group: u32, attr: u32) -> Result { + let mut val = 0; + + let mut gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + 
addr: &mut val as *mut u64 as u64, + flags: 0, + }; + + // SAFETY: gicv3_its_attr.addr is safe to write to. + unsafe { its_device.get_device_attr(&mut gicv3_its_attr) } + .map_err(|err| GicError::DeviceAttribute(err, false, group))?; + + Ok(val) +} + +fn its_read_register(its_fd: &DeviceFd, attr: u32) -> Result { + get_device_attribute(its_fd, KVM_DEV_ARM_VGIC_GRP_ITS_REGS, attr) +} + +fn its_set_register(its_fd: &DeviceFd, attr: u32, val: u64) -> Result<(), GicError> { + set_device_attribute(its_fd, KVM_DEV_ARM_VGIC_GRP_ITS_REGS, attr, val) +} + +pub fn its_save_tables(its_fd: &DeviceFd) -> Result<(), GicError> { + set_device_attribute( + its_fd, + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_ITS_SAVE_TABLES, + 0, + ) +} + +pub fn its_restore_tables(its_fd: &DeviceFd) -> Result<(), GicError> { + set_device_attribute( + its_fd, + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_ITS_RESTORE_TABLES, + 0, + ) +} + +/// ITS registers that we save/restore during snapshot +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct ItsRegisterState { + iidr: u64, + cbaser: u64, + creadr: u64, + cwriter: u64, + baser: [u64; 8], + ctlr: u64, +} + +impl ItsRegisterState { + /// Save ITS state + pub fn save(its_fd: &DeviceFd) -> Result { + let mut state = ItsRegisterState::default(); + + for i in 0..8 { + state.baser[i as usize] = its_read_register(its_fd, GITS_BASER + i * 8)?; + } + state.ctlr = its_read_register(its_fd, GITS_CTLR)?; + state.cbaser = its_read_register(its_fd, GITS_CBASER)?; + state.creadr = its_read_register(its_fd, GITS_CREADR)?; + state.cwriter = its_read_register(its_fd, GITS_CWRITER)?; + state.iidr = its_read_register(its_fd, GITS_IIDR)?; + + Ok(state) + } + + /// Restore ITS state + /// + /// We need to restore ITS registers in a very specific order for things to work. 
Take a look + /// at: + /// https://elixir.bootlin.com/linux/v6.1.141/source/Documentation/virt/kvm/devices/arm-vgic-its.rst#L60 + /// and + /// https://elixir.bootlin.com/linux/v6.1.141/source/Documentation/virt/kvm/devices/arm-vgic-its.rst#L123 + /// + /// for more details, but TL;DR is: + /// + /// We need to restore GITS_CBASER, GITS_CREADER, GITS_CWRITER, GITS_BASER and GITS_IIDR + /// registers before restoring ITS tables from guest memory. We also need to set GITS_CTLR + /// last. + pub fn restore(&self, its_fd: &DeviceFd) -> Result<(), GicError> { + its_set_register(its_fd, GITS_IIDR, self.iidr)?; + its_set_register(its_fd, GITS_CBASER, self.cbaser)?; + its_set_register(its_fd, GITS_CREADR, self.creadr)?; + its_set_register(its_fd, GITS_CWRITER, self.cwriter)?; + for i in 0..8 { + its_set_register(its_fd, GITS_BASER + i * 8, self.baser[i as usize])?; + } + // We need to restore saved ITS tables before restoring GITS_CTLR + its_restore_tables(its_fd)?; + its_set_register(its_fd, GITS_CTLR, self.ctlr) + } +} diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs index 0531766dc54..3df0d4642d7 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs @@ -3,45 +3,63 @@ mod dist_regs; mod icc_regs; +pub mod its_regs; mod redist_regs; +use its_regs::{ItsRegisterState, its_save_tables}; use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicState, GicVcpuState}; /// Save the state of the GIC device. -pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { +pub fn save_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], +) -> Result { // Flush redistributors pending tables to guest RAM. - super::save_pending_tables(fd)?; + super::save_pending_tables(gic_device)?; + // Flush ITS tables into guest memory. 
+ its_save_tables(its_device)?; let mut vcpu_states = Vec::with_capacity(mpidrs.len()); for mpidr in mpidrs { vcpu_states.push(GicVcpuState { - rdist: redist_regs::get_redist_regs(fd, *mpidr)?, - icc: icc_regs::get_icc_regs(fd, *mpidr)?, + rdist: redist_regs::get_redist_regs(gic_device, *mpidr)?, + icc: icc_regs::get_icc_regs(gic_device, *mpidr)?, }) } + let its_state = ItsRegisterState::save(its_device)?; + Ok(GicState { - dist: dist_regs::get_dist_regs(fd)?, + dist: dist_regs::get_dist_regs(gic_device)?, gic_vcpu_states: vcpu_states, + its_state: Some(its_state), }) } /// Restore the state of the GIC device. -pub fn restore_state(fd: &DeviceFd, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - dist_regs::set_dist_regs(fd, &state.dist)?; +pub fn restore_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], + state: &GicState, +) -> Result<(), GicError> { + dist_regs::set_dist_regs(gic_device, &state.dist)?; if mpidrs.len() != state.gic_vcpu_states.len() { return Err(GicError::InconsistentVcpuCount); } for (mpidr, vcpu_state) in mpidrs.iter().zip(&state.gic_vcpu_states) { - redist_regs::set_redist_regs(fd, *mpidr, &vcpu_state.rdist)?; - icc_regs::set_icc_regs(fd, *mpidr, &vcpu_state.icc)?; + redist_regs::set_redist_regs(gic_device, *mpidr, &vcpu_state.rdist)?; + icc_regs::set_icc_regs(gic_device, *mpidr, &vcpu_state.icc)?; } - Ok(()) + // Safe to unwrap here, as we know we support an ITS device, so `its_state.is_some()` is always + // `true`. + state.its_state.as_ref().unwrap().restore(its_device) } #[cfg(test)] @@ -59,9 +77,10 @@ mod tests { let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); let mpidr = vec![1]; - let res = save_state(gic_fd, &mpidr); + let res = save_state(gic_fd, its_fd, &mpidr); // We will receive an error if trying to call before creating vcpu. 
assert_eq!( format!("{:?}", res.unwrap_err()), @@ -73,8 +92,9 @@ mod tests { let _vcpu = vm.create_vcpu(0).unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); - let vm_state = save_state(gic_fd, &mpidr).unwrap(); + let vm_state = save_state(gic_fd, its_fd, &mpidr).unwrap(); let val: u32 = 0; let gicd_statusr_off = 0x0010u64; let mut gic_dist_attr = kvm_bindings::kvm_device_attr { @@ -94,7 +114,7 @@ mod tests { assert_eq!(gicd_statusr.chunks[0], val); assert_eq!(vm_state.dist.len(), 12); - restore_state(gic_fd, &mpidr, &vm_state).unwrap(); - restore_state(gic_fd, &[1, 2], &vm_state).unwrap_err(); + restore_state(gic_fd, its_fd, &mpidr, &vm_state).unwrap(); + restore_state(gic_fd, its_fd, &[1, 2], &vm_state).unwrap_err(); } } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs index 4d1ba3292c1..96aaebc87bd 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs @@ -28,11 +28,11 @@ const GICR_ICFGR0: SimpleReg = SimpleReg::new(GICR_SGI_OFFSET + 0x0C00, 8); // List with relevant redistributor registers that we will be restoring. static VGIC_RDIST_REGS: &[SimpleReg] = &[ - GICR_CTLR, GICR_STATUSR, GICR_WAKER, GICR_PROPBASER, GICR_PENDBASER, + GICR_CTLR, ]; // List with relevant SGI associated redistributor registers that we will be restoring. 
diff --git a/src/vmm/src/arch/aarch64/gic/mod.rs b/src/vmm/src/arch/aarch64/gic/mod.rs index cda423f478c..9bfabee1fea 100644 --- a/src/vmm/src/arch/aarch64/gic/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/mod.rs @@ -21,8 +21,14 @@ pub struct GIC { /// GIC device properties, to be used for setting up the fdt entry properties: [u64; 4], + /// MSI properties of the GIC device + msi_properties: Option<[u64; 2]>, + /// Number of CPUs handled by the device vcpu_count: u64, + + /// ITS device + its_device: Option, } impl GIC { /// Returns the file descriptor of the GIC device @@ -80,6 +86,14 @@ impl GICDevice { } } + /// Returns the file descriptor of the ITS device, if any + pub fn its_fd(&self) -> Option<&DeviceFd> { + match self { + Self::V2(_) => None, + Self::V3(x) => x.its_device.as_ref(), + } + } + /// Returns an array with GIC device properties pub fn device_properties(&self) -> &[u64] { match self { @@ -88,6 +102,14 @@ impl GICDevice { } } + /// Returns an array with MSI properties if GIC supports it + pub fn msi_properties(&self) -> Option<&[u64; 2]> { + match self { + Self::V2(x) => x.msi_properties.as_ref(), + Self::V3(x) => x.msi_properties.as_ref(), + } + } + /// Returns the number of vCPUs this GIC handles pub fn vcpu_count(&self) -> u64 { match self { diff --git a/src/vmm/src/arch/aarch64/gic/regs.rs b/src/vmm/src/arch/aarch64/gic/regs.rs index 60987cc973d..1afa7acde9c 100644 --- a/src/vmm/src/arch/aarch64/gic/regs.rs +++ b/src/vmm/src/arch/aarch64/gic/regs.rs @@ -10,6 +10,7 @@ use kvm_ioctls::DeviceFd; use serde::{Deserialize, Serialize}; use crate::arch::aarch64::gic::GicError; +use crate::arch::aarch64::gic::gicv3::regs::its_regs::ItsRegisterState; #[derive(Debug, Serialize, Deserialize)] pub struct GicRegState { @@ -30,6 +31,8 @@ pub struct GicState { pub dist: Vec>, /// The state of the vcpu interfaces. pub gic_vcpu_states: Vec, + /// The state of the ITS device. Only present with GICv3. 
+ pub its_state: Option, } /// Structure used for serializing the state of the GIC registers for a specific vCPU. diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 03fba87f4fedcb57536d5219315cbe6474adc7b9..35f4e9b63a35caa91b793f37e857fe3ae1c3f3aa 100644 GIT binary patch delta 357 zcmYMuy-EW?5Ww->yIVDCl1t)O{5VOMB3B72SO`|O;u9Q)%0WKhE?8MCSmXir0ek@) zLGT&USy*Umt=KE5|49S~emk=>%(B*6ZP9)#f4?hb5~b9>Qo%8~jT%A_x=BBDEz`I= zxbhO+E3$MtQ9ijUb&~fw7g-u#?!?)!m`r6RF^1Fu&i`t;%j+jlhhK6M-t=_$-kyGT zZ(h-Qopk>9KXiXrTC!ilkF)-%uDd7yrNX!3bkHB>gKqaMcelaOb!0}7(eU6Sfb?cC oi#e1qj|D8Ej0&mpamDL6FH?Hyqx%cf!So?tb`a45xjxq1YuB_ArGB|tOs0pfWAT&Zd|zX z8XmwaxEGD$-xIOm>nf_6qOvTL49(Zl&%09QNh#G<%D*KyQ9~7koY8OFI}<#gPQ9-5 zM{zWtC{3=ox|{ZoZsI7M9Eb6EluzBv1>5s~KKc5ub~CsA66*ZRy@WS^?VEQZGh5f` zp%n7g{{jhQ&~h!epvY+r`=~q8vNQc%=B_Xr4#w%Q*SkvXhktROn1{p&JZSiEy#f}n nh$5D-j1`nnMg>*Wu!;b6tYIA+*u)mLv4dUgVgDsD2c6GfW|v3F delta 249 zcmXBKy9ok86ouhC&Wz9X{eFXqMn)o{kywHaSb!K9nJg@9U;)#D4Qym;2?n-dEC`DK z;)RdHIou!!RDj7n|9y(srHCvgi7UIFi@@K{RQ2 zH0m&qr?_Y6HRqYF8oQnOxV>G6zCHU^Aps48cuOLMG&0B{2M>7^P(%r3R8U0?bu`dK U3vG1JMGt)puD%_HZ+EksA4L*6$N&HU From 7053fb39d832cdfc05c022492e8f9a7eb68e53b7 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 17 Jun 2025 13:13:26 +0200 Subject: [PATCH 22/27] test: VirtIO PCI device create and restoration Refactor the test code that inserts VirtIO devices in a Vmm object and then add a test which creates a Vmm with PCI devices and then serializes and deserializes the device manager and ensures that everything is as restored as expected. 
Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 12 -- src/vmm/src/device_manager/mod.rs | 30 ++++ src/vmm/src/device_manager/pci_mngr.rs | 184 +++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 12 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 4b998fdf138..e196ef505c2 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -848,7 +848,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); @@ -874,7 +873,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); @@ -909,7 +907,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); @@ -961,7 +958,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -983,7 +979,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1006,7 +1001,6 @@ pub(crate) mod tests { assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1044,19 +1038,16 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1086,7 +1077,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( 
vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1108,7 +1098,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1130,7 +1119,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 0b610ccf454..fae7d12748d 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -375,6 +375,36 @@ impl DeviceManager { Self::do_mark_virtio_queue_memory_dirty(virtio_device, mem); } } + + /// Get a VirtIO device of type `virtio_type` with ID `device_id` + pub fn get_virtio_device( + &self, + virtio_type: u32, + device_id: &str, + ) -> Option>> { + if self.pci_devices.pci_segment.is_some() { + let pci_device = self.pci_devices.get_virtio_device(virtio_type, device_id)?; + Some( + pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .clone(), + ) + } else { + let mmio_device = self + .mmio_devices + .get_virtio_device(virtio_type, device_id)?; + Some( + mmio_device + .inner + .lock() + .expect("Poisoned lock") + .device() + .clone(), + ) + } + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 303a34a3448..5c09085e84d 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -585,3 +585,187 @@ impl<'a> Persist<'a> for PciDevices { Ok(pci_devices) } } + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::builder::tests::*; + use crate::device_manager; + use crate::devices::virtio::block::CacheType; + use 
crate::mmds::data_store::MmdsVersion; + use crate::resources::VmmConfig; + use crate::snapshot::Snapshot; + use crate::vmm_config::balloon::BalloonDeviceConfig; + use crate::vmm_config::entropy::EntropyDeviceConfig; + use crate::vmm_config::net::NetworkInterfaceConfig; + use crate::vmm_config::vsock::VsockDeviceConfig; + + #[test] + fn test_device_manager_persistence() { + let mut buf = vec![0; 65536]; + // These need to survive so the restored blocks find them. + let _block_files; + let mut tmp_sock_file = TempFile::new().unwrap(); + tmp_sock_file.remove().unwrap(); + // Set up a vmm with one of each device, and get the serialized DeviceStates. + { + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm).unwrap(); + let mut cmdline = default_kernel_cmdline(); + + // Add a balloon device. + let balloon_cfg = BalloonDeviceConfig { + amount_mib: 123, + deflate_on_oom: false, + stats_polling_interval_s: 1, + }; + insert_balloon_device(&mut vmm, &mut cmdline, &mut event_manager, balloon_cfg); + // Add a block device. + let drive_id = String::from("root"); + let block_configs = vec![CustomBlockConfig::new( + drive_id, + true, + None, + true, + CacheType::Unsafe, + )]; + _block_files = + insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); + // Add a net device. + let network_interface = NetworkInterfaceConfig { + iface_id: String::from("netif"), + host_dev_name: String::from("hostname"), + guest_mac: None, + rx_rate_limiter: None, + tx_rate_limiter: None, + }; + insert_net_device_with_mmds( + &mut vmm, + &mut cmdline, + &mut event_manager, + network_interface, + MmdsVersion::V2, + ); + // Add a vsock device. 
+ let vsock_dev_id = "vsock"; + let vsock_config = VsockDeviceConfig { + vsock_id: Some(vsock_dev_id.to_string()), + guest_cid: 3, + uds_path: tmp_sock_file.as_path().to_str().unwrap().to_string(), + }; + insert_vsock_device(&mut vmm, &mut cmdline, &mut event_manager, vsock_config); + // Add an entropy device. + let entropy_config = EntropyDeviceConfig::default(); + insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); + + Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.device_manager.save()).unwrap(); + } + + tmp_sock_file.remove().unwrap(); + + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + // Keep in mind we are re-creating here an empty DeviceManager. Restoring later on + // will create a new PciDevices manager different than vmm.pci_devices. We're doing + // this to avoid restoring the whole Vmm, since what we really need from Vmm is the Vm + // object and calling default_vmm() is the easiest way to create one. + let vmm = default_vmm(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let vm_resources = &mut VmResources::default(); + let restore_args = PciDevicesConstructorArgs { + vm: vmm.vm.clone(), + mem: vmm.vm.guest_memory(), + vm_resources, + instance_id: "microvm-id", + restored_from_file: true, + event_manager: &mut event_manager, + }; + let _restored_dev_manager = + PciDevices::restore(restore_args, &device_manager_state.pci_state).unwrap(); + + let expected_vm_resources = format!( + r#"{{ + "balloon": {{ + "amount_mib": 123, + "deflate_on_oom": false, + "stats_polling_interval_s": 1 + }}, + "drives": [ + {{ + "drive_id": "root", + "partuuid": null, + "is_root_device": true, + "cache_type": "Unsafe", + "is_read_only": true, + "path_on_host": "{}", + "rate_limiter": null, + "io_engine": "Sync", + "socket": null + }} + ], + "boot-source": {{ + "kernel_image_path": "", + "initrd_path": null, + "boot_args": null + }}, 
+ "cpu-config": null, + "logger": null, + "machine-config": {{ + "vcpu_count": 1, + "mem_size_mib": 128, + "smt": false, + "track_dirty_pages": false, + "huge_pages": "None" + }}, + "metrics": null, + "mmds-config": {{ + "version": "V2", + "network_interfaces": [ + "netif" + ], + "ipv4_address": "169.254.169.254" + }}, + "network-interfaces": [ + {{ + "iface_id": "netif", + "host_dev_name": "hostname", + "guest_mac": null, + "rx_rate_limiter": null, + "tx_rate_limiter": null + }} + ], + "vsock": {{ + "guest_cid": 3, + "uds_path": "{}" + }}, + "entropy": {{ + "rate_limiter": null + }} +}}"#, + _block_files.last().unwrap().as_path().to_str().unwrap(), + tmp_sock_file.as_path().to_str().unwrap() + ); + + assert_eq!( + vm_resources + .mmds + .as_ref() + .unwrap() + .lock() + .unwrap() + .version(), + MmdsVersion::V2 + ); + assert_eq!( + device_manager_state.pci_state.mmds_version.unwrap(), + MmdsVersion::V2.into() + ); + assert_eq!( + expected_vm_resources, + serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() + ); + } +} From d2cf9abf3591e46c3ad2a185438a791ce7e4e5c6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 20 Jun 2025 14:14:58 +0200 Subject: [PATCH 23/27] test: enable PCI microVMs for performance testing Use pci_enabled fixture for boot time, block, and network tests to create PCI microVM variants as well. 
Signed-off-by: Babis Chalios --- tests/framework/microvm.py | 1 + .../performance/test_block_ab.py | 3 ++- .../performance/test_boottime.py | 25 ++++++++++++++----- .../performance/test_network_ab.py | 4 +-- .../performance/test_vsock_ab.py | 3 ++- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index a1f46fd89c2..887939b5bf8 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -481,6 +481,7 @@ def dimensions(self): "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), "guest_memory": f"{self.mem_size_bytes / (1024 * 1024)}MB", + "pci": f"{self.pci_enabled}", } @property diff --git a/tests/integration_tests/performance/test_block_ab.py b/tests/integration_tests/performance/test_block_ab.py index dfd0728084a..7fe9216e559 100644 --- a/tests/integration_tests/performance/test_block_ab.py +++ b/tests/integration_tests/performance/test_block_ab.py @@ -168,6 +168,7 @@ def test_block_performance( fio_mode, fio_block_size, fio_engine, + pci_enabled, io_engine, metrics, results_dir, @@ -176,7 +177,7 @@ def test_block_performance( Execute block device emulation benchmarking scenarios. """ vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() # Add a secondary block device for benchmark tests. 
diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 7708451ec7f..4eb9a267475 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -95,12 +95,12 @@ def to_ms(v, unit): def launch_vm_with_boot_timer( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib + microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, @@ -116,9 +116,11 @@ def launch_vm_with_boot_timer( return (vm, boot_time_us, cpu_boot_time_us) -def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): +def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): """Tests that the boot timer device works""" - launch_vm_with_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, 1, 128) + launch_vm_with_boot_timer( + microvm_factory, guest_kernel_acpi, rootfs, 1, 128, pci_enabled + ) @pytest.mark.parametrize( @@ -127,13 +129,24 @@ def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): ) @pytest.mark.nonci def test_boottime( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, metrics + microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + pci_enabled, + metrics, ): """Test boot time with different guest configurations""" for i in range(10): vm, boot_time_us, cpu_boot_time_us = launch_vm_with_boot_timer( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib + microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + pci_enabled, ) if i == 0: diff --git 
a/tests/integration_tests/performance/test_network_ab.py b/tests/integration_tests/performance/test_network_ab.py index 3355d54c2bc..4c2deba0041 100644 --- a/tests/integration_tests/performance/test_network_ab.py +++ b/tests/integration_tests/performance/test_network_ab.py @@ -38,7 +38,7 @@ def consume_ping_output(ping_putput, request_per_round): @pytest.fixture -def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): +def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): """Creates a microvm with the networking setup used by the performance tests in this file. This fixture receives its vcpu count via indirect parameterization""" @@ -46,7 +46,7 @@ def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): guest_vcpus = request.param vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) vm.add_net_iface() vm.start() diff --git a/tests/integration_tests/performance/test_vsock_ab.py b/tests/integration_tests/performance/test_vsock_ab.py index bad4436e568..5a023f53eea 100644 --- a/tests/integration_tests/performance/test_vsock_ab.py +++ b/tests/integration_tests/performance/test_vsock_ab.py @@ -80,6 +80,7 @@ def test_vsock_throughput( rootfs, vcpus, payload_length, + pci_enabled, mode, metrics, results_dir, @@ -95,7 +96,7 @@ def test_vsock_throughput( mem_size_mib = 1024 vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib) vm.add_net_iface() # Create a vsock device From fcdeb7a7a1e965025c4a36e0c18aaf9b9a33f840 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 25 Jun 2025 10:00:15 +0200 Subject: [PATCH 
24/27] test: remove pci=off default from various parts in tests We only pass pci=off if PCI is disabled in Firecracker. Adapt tests and comments to reflect that. Signed-off-by: Babis Chalios --- tests/framework/microvm.py | 6 ++++-- tests/integration_tests/performance/test_boottime.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 887939b5bf8..ac9b3c077eb 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -784,8 +784,10 @@ def basic_config( the response is within the interval [200, 300). If boot_args is None, the default boot_args in Firecracker is - reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0 - i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd + reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux + i8042.nopnp i8042.dumbkbd + + if PCI is disabled, Firecracker also passes to the guest pci=off Reference: file:../../src/vmm/src/vmm_config/boot_source.rs::DEFAULT_KERNEL_CMDLINE """ diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 4eb9a267475..173e352f67d 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -11,7 +11,7 @@ # Regex for obtaining boot time from some string. 
DEFAULT_BOOT_ARGS = ( - "reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0" + "reboot=k panic=1 nomodule 8250.nr_uarts=0" " i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd" ) @@ -98,13 +98,14 @@ def launch_vm_with_boot_timer( microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" + boot_args = DEFAULT_BOOT_ARGS if pci_enabled else DEFAULT_BOOT_ARGS + " pci=off" vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) vm.spawn(pci=pci_enabled) vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, - boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", + boot_args=boot_args + " init=/usr/local/bin/init", enable_entropy_device=True, ) vm.add_net_iface() From 0c940cd0a77abcd56d49fbd20ece7f97a2b5be24 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 30 Jun 2025 12:26:48 +0200 Subject: [PATCH 25/27] virtio: add kick() method in VirtioDevice trait So that we don't have to downcast VirtioDevice trait objects to the actual device type before calling the logic to process events for each device. 
Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mod.rs | 89 +++----------------- src/vmm/src/devices/virtio/balloon/device.rs | 12 ++- src/vmm/src/devices/virtio/block/device.rs | 13 +++ src/vmm/src/devices/virtio/device.rs | 3 + src/vmm/src/devices/virtio/net/device.rs | 13 ++- src/vmm/src/devices/virtio/rng/device.rs | 8 ++ src/vmm/src/devices/virtio/vsock/device.rs | 15 +++- 7 files changed, 74 insertions(+), 79 deletions(-) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index fae7d12748d..9237538fd60 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -30,14 +30,8 @@ use crate::devices::legacy::RTCDevice; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; use crate::devices::pseudo::BootTimer; -use crate::devices::virtio::balloon::Balloon; -use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::net::Net; -use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; -use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -265,85 +259,28 @@ impl DeviceManager { self.pci_devices.attach_pci_segment(vm) } - fn do_kick_device(virtio_device: Arc>) { - let mut device = virtio_device.lock().expect("Poisoned lock"); - match device.device_type() { - TYPE_BALLOON => { - let balloon = device.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. 
- if balloon.is_activated() { - info!("kick balloon {}.", balloon.id()); - balloon.process_virtio_queues(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. - if let Some(block) = device.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", block.id()); - block.process_virtio_queues(); - } - } - } - TYPE_NET => { - let net = device.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if net.is_activated() { - info!("kick net {}.", net.id()); - net.process_virtio_queues(); - } - } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
- let vsock = device - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {}.", vsock.id()); - vsock.signal_used_queue(0).unwrap(); - } - } - TYPE_RNG => { - let entropy = device.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {}.", entropy.id()); - entropy.process_virtio_queues(); - } - } - _ => (), - } - } - /// Artificially kick VirtIO devices as if they had external events. pub fn kick_virtio_devices(&self) { info!("Artificially kick devices"); // Go through MMIO VirtIO devices let _: Result<(), MmioError> = self.mmio_devices.for_each_virtio_device(|_, _, device| { let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); - Self::do_kick_device(mmio_transport_locked.device()); + mmio_transport_locked + .device() + .lock() + .expect("Poisoned lock") + .kick(); Ok(()) }); // Go through PCI VirtIO devices - for device in self.pci_devices.virtio_devices.values() { - let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); - Self::do_kick_device(virtio_device); + for virtio_pci_device in self.pci_devices.virtio_devices.values() { + virtio_pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .lock() + .expect("Poisoned lock") + .kick(); } } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 3927b7e0aef..754c25bdcba 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -5,7 +5,7 @@ use std::ops::Deref; use std::sync::Arc; use std::time::Duration; -use log::error; +use log::{error, info}; use serde::Serialize; use timerfd::{ClockId, SetTimeFlags, TimerFd, TimerState}; use vmm_sys_util::eventfd::EventFd; @@ -615,6 +615,16 @@ impl VirtioDevice for Balloon { fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // If device is activated, kick the balloon queue(s) to make up for 
any + // pending or in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. + if self.is_activated() { + info!("kick balloon {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index f2f797e60f3..a7141773858 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use event_manager::{EventOps, Events, MutEventSubscriber}; +use log::info; use vmm_sys_util::eventfd::EventFd; use super::BlockError; @@ -214,6 +215,18 @@ impl VirtioDevice for Block { Self::VhostUser(b) => b.device_state.is_activated(), } } + + fn kick(&mut self) { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if self.is_activated() { + info!("kick block {}.", self.id()); + self.process_virtio_queues(); + } + } } impl MutEventSubscriber for Block { diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 49ac1802447..6e44c3f00fa 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -159,6 +159,9 @@ pub trait VirtioDevice: AsAny + Send { } Ok(()) } + + /// Kick the device, as if it had received external events. 
+ fn kick(&mut self) {} } impl fmt::Debug for dyn VirtioDevice { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index bf7a91e21f3..217d8ce9498 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -13,7 +13,7 @@ use std::ops::Deref; use std::sync::{Arc, Mutex}; use libc::{EAGAIN, iovec}; -use log::error; +use log::{error, info}; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; @@ -1053,6 +1053,17 @@ impl VirtioDevice for Net { fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if self.is_activated() { + info!("kick net {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 937e113d096..9dbd66d5156 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -6,6 +6,7 @@ use std::ops::Deref; use std::sync::Arc; use aws_lc_rs::rand; +use log::info; use vm_memory::GuestMemoryError; use vmm_sys_util::eventfd::EventFd; @@ -309,6 +310,13 @@ impl VirtioDevice for Entropy { self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } + + fn kick(&mut self) { + if self.is_activated() { + info!("kick entropy {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index c9daf19fd94..e3b003f498e 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -24,7 +24,7 @@ use std::fmt::Debug; use 
std::ops::Deref; use std::sync::Arc; -use log::{error, warn}; +use log::{error, info, warn}; use vmm_sys_util::eventfd::EventFd; use super::super::super::DeviceError; @@ -370,6 +370,19 @@ where fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. + if self.is_activated() { + info!("kick vsock {}.", self.id()); + self.signal_used_queue(0).unwrap(); + } + } } #[cfg(test)] From 4f3917022d847435c524794ea1da351c5111fb65 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 30 Jun 2025 18:33:40 +0200 Subject: [PATCH 26/27] refactor: simplify ResourceAllocator internals Instead of storing internal allocators of ResourceAllocator within an Arc> container, just store `ResourceAllocator` itself in an `Arc>`. Apart from that, we get rid of the `ResourceAllocatorState` state object, and just clone `ResourceAllocator` itself when we want to save/restore. Also, make the creation of `ResourceAllocator` infallible, since we know that the ranges we are using are correct. Finally, fix saving/restoring the state of ResourceAllocator. We were actually not resetting it correctly upon snapshot restore. The reason why this was not a problem is that we don't actually need to perform any new allocations post restore at the moment. However, like this we are ready when we need to perform any hot-plugging operations. Also, add a unit-test to ensure that this logic works correctly.
Signed-off-by: Babis Chalios --- Cargo.lock | 4 +- src/pci/Cargo.toml | 2 +- src/vmm/Cargo.toml | 2 +- src/vmm/src/acpi/mod.rs | 33 +++---- src/vmm/src/arch/aarch64/vm.rs | 7 ++ src/vmm/src/arch/x86_64/mod.rs | 20 ++-- src/vmm/src/arch/x86_64/mptable.rs | 31 +++--- src/vmm/src/arch/x86_64/vm.rs | 9 +- src/vmm/src/builder.rs | 6 +- src/vmm/src/device_manager/mmio.rs | 20 ++-- src/vmm/src/device_manager/mod.rs | 2 +- src/vmm/src/device_manager/pci_mngr.rs | 21 ++-- src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/acpi/vmgenid.rs | 4 +- src/vmm/src/devices/pci/pci_segment.rs | 11 +-- src/vmm/src/vstate/resources.rs | 127 ++++++++----------------- src/vmm/src/vstate/vm.rs | 58 ++++++++++- 17 files changed, 185 insertions(+), 174 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6865762136f..84eab68c54c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1618,9 +1618,9 @@ checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" [[package]] name = "vm-allocator" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c2fce39487bd03b5b0ab176f584682e9eaab7875254bafd3d188c69c85fce6e" +checksum = "040a65b0c29f298d71ca45dd52d02b0d0ddc15b9b97d95dfeebe67d6fdd42a28" dependencies = [ "libc", "serde", diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index 3549d5010fe..d179854f391 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -18,7 +18,7 @@ libc = "0.2.172" log = "0.4.27" serde = { version = "1.0.219", features = ["derive"] } thiserror = "2.0.12" -vm-allocator = "0.1.2" +vm-allocator = "0.1.3" vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.1", features = [ "backend-mmap", diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 8dd8192e42d..fb2752e3c36 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -52,7 +52,7 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } uuid = "1.16.0" vhost = { version = "0.14.0", features = 
["vhost-user-frontend"] } -vm-allocator = { version = "0.1.2", features = ["serde"] } +vm-allocator = { version = "0.1.3", features = ["serde"] } vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.2", features = [ "backend-mmap", diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 51711d9eb92..f3b4164745a 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -54,7 +54,7 @@ impl AcpiTableWriter<'_> { /// buffer. It returns the address in which it wrote the table. fn write_acpi_table( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, table: &mut S, ) -> Result where @@ -83,7 +83,7 @@ impl AcpiTableWriter<'_> { fn build_dsdt( &mut self, device_manager: &mut DeviceManager, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, ) -> Result { let mut dsdt_data = Vec::new(); @@ -111,7 +111,7 @@ impl AcpiTableWriter<'_> { /// This includes a pointer with the location of the DSDT in guest memory fn build_fadt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, dsdt_addr: u64, ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); @@ -129,7 +129,7 @@ impl AcpiTableWriter<'_> { /// This includes information about the interrupt controllers supported in the platform fn build_madt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, nr_vcpus: u8, ) -> Result { let mut madt = Madt::new( @@ -147,7 +147,7 @@ impl AcpiTableWriter<'_> { /// Currently, we pass to the guest just FADT and MADT tables. fn build_xsdt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, fadt_addr: u64, madt_addr: u64, mcfg_addr: u64, @@ -164,7 +164,7 @@ impl AcpiTableWriter<'_> { /// Build the MCFG table for the guest. 
fn build_mcfg( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, pci_mmio_config_addr: u64, ) -> Result { let mut mcfg = Mcfg::new(OEM_ID, *b"FCMVMCFG", OEM_REVISION, pci_mmio_config_addr); @@ -197,7 +197,7 @@ impl AcpiTableWriter<'_> { pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, device_manager: &mut DeviceManager, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; @@ -249,18 +249,19 @@ mod tests { let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), }; + let mut resource_allocator = vmm.vm.resource_allocator(); // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -275,27 +276,27 @@ mod tests { // succeed. 
let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -312,11 +313,11 @@ mod tests { let mut writer = AcpiTableWriter { mem: vm.guest_memory(), }; - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); let err = writer - .write_acpi_table(&resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs index e54723f5b6d..eaec0932a42 100644 --- a/src/vmm/src/arch/aarch64/vm.rs +++ b/src/vmm/src/arch/aarch64/vm.rs @@ -1,11 +1,14 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::Mutex; + use serde::{Deserialize, Serialize}; use crate::Kvm; use crate::arch::aarch64::gic::GicState; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{VmCommon, VmError}; /// Structure representing the current architecture's understand of what a "virtual machine" is. @@ -74,6 +77,7 @@ impl ArchVm { .get_irqchip() .save_device(mpidrs) .map_err(ArchVmError::SaveGic)?, + resource_allocator: self.resource_allocator().clone(), }) } @@ -86,6 +90,7 @@ impl ArchVm { self.get_irqchip() .restore_device(mpidrs, &state.gic) .map_err(ArchVmError::RestoreGic)?; + self.common.resource_allocator = Mutex::new(state.resource_allocator.clone()); Ok(()) } @@ -98,4 +103,6 @@ pub struct VmState { pub memory: GuestMemoryState, /// GIC state. pub gic: GicState, + /// resource allocator + pub resource_allocator: ResourceAllocator, } diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 5307dbdf710..1822abb9009 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -217,7 +217,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vm.guest_memory(), - &vm.common.resource_allocator, + &mut vm.resource_allocator(), vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -241,7 +241,7 @@ pub fn configure_system_for_boot( create_acpi_tables( vm.guest_memory(), device_manager, - &vm.common.resource_allocator, + &mut vm.resource_allocator(), vcpus, )?; Ok(()) @@ -607,8 +607,8 @@ mod tests { fn test_system_configuration() { let no_vcpus = 4; let gm = single_region_mem(0x10000); - let resource_allocator = ResourceAllocator::new().unwrap(); - let err = mptable::setup_mptable(&gm, &resource_allocator, 1); + let mut resource_allocator = ResourceAllocator::new(); + let err = 
mptable::setup_mptable(&gm, &mut resource_allocator, 1); assert!(matches!( err.unwrap_err(), mptable::MptableError::NotEnoughMemory @@ -617,24 +617,24 @@ mod tests { // Now assigning some memory that falls before the 32bit memory hole. let mem_size = mib_to_bytes(128); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that is equal to the start of the 32bit memory hole. let mem_size = mib_to_bytes(3328); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that falls after the 32bit memory hole. 
let mem_size = mib_to_bytes(3330); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); } diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index 17b2900aeb2..a4b1e2fa632 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -116,7 +116,7 @@ fn compute_mp_size(num_cpus: u8) -> usize { /// Performs setup of the MP table for the given `num_cpus`. pub fn setup_mptable( mem: &GuestMemoryMmap, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, num_cpus: u8, ) -> Result<(), MptableError> { if num_cpus > MAX_SUPPORTED_CPUS { @@ -334,27 +334,27 @@ mod tests { fn bounds_check() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); } #[test] fn bounds_check_fails() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus) - 1); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap_err(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap_err(); } #[test] fn mpf_intel_checksum() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut 
resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -365,9 +365,9 @@ mod tests { fn mpc_table_checksum() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -388,9 +388,9 @@ mod tests { fn mpc_entry_count() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -419,8 +419,9 @@ mod tests { let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(MAX_SUPPORTED_CPUS)); for i in 0..MAX_SUPPORTED_CPUS { - let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &resource_allocator, i).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + + setup_mptable(&mem, &mut resource_allocator, i).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -450,9 +451,9 @@ mod tests { fn cpu_entry_count_max() { let cpus = MAX_SUPPORTED_CPUS + 1; let mem = single_region_mem_at(SYSTEM_MEM_START, 
compute_mp_size(cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - let result = setup_mptable(&mem, &resource_allocator, cpus).unwrap_err(); + let result = setup_mptable(&mem, &mut resource_allocator, cpus).unwrap_err(); assert_eq!(result, MptableError::TooManyCpus); } } diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index fbc27c82a60..e194296928d 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, @@ -15,7 +15,7 @@ use crate::arch::x86_64::msr::MsrError; use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; -use crate::vstate::resources::ResourceAllocatorState; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] @@ -142,6 +142,7 @@ impl ArchVm { self.fd() .set_irqchip(&state.ioapic) .map_err(ArchVmError::SetIrqChipIoAPIC)?; + self.common.resource_allocator = Mutex::new(state.resource_allocator.clone()); Ok(()) } @@ -195,7 +196,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), - resource_allocator: self.common.resource_allocator.save(), + resource_allocator: self.resource_allocator().save(), pitstate, clock, pic_master, @@ -221,7 +222,7 @@ pub struct VmState { /// guest memory state pub memory: GuestMemoryState, /// resource allocator - pub resource_allocator: ResourceAllocatorState, + pub resource_allocator: ResourceAllocator, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index e196ef505c2..b9e5471402c 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -254,7 +254,7 @@ pub fn build_microvm_for_boot( #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&vm.common.resource_allocator, &mut vcpus)?; + setup_pvtime(&mut vm.resource_allocator(), &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } @@ -515,7 +515,7 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { @@ -529,7 +529,7 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] fn setup_pvtime( - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpus: &mut [Vcpu], ) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 13ab13f47ea..fe32376ebb4 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -154,7 +154,7 @@ impl MMIODeviceManager { /// Allocates resources for a new device to be added. fn allocate_mmio_resources( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, irq_count: u32, ) -> Result { let irq = match resource_allocator.allocate_gsi(irq_count)?[..] 
{ @@ -243,7 +243,7 @@ impl MMIODeviceManager { _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { let device = MMIODevice { - resources: self.allocate_mmio_resources(&vm.common.resource_allocator, 1)?, + resources: self.allocate_mmio_resources(&mut vm.resource_allocator(), 1)?, inner: Arc::new(Mutex::new(mmio_device)), }; @@ -277,7 +277,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = vm.common.resource_allocator.allocate_gsi(1)?; + let gsi = vm.resource_allocator().allocate_gsi(1)?; MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, @@ -336,7 +336,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = vm.common.resource_allocator.allocate_gsi(1)?; + let gsi = vm.resource_allocator().allocate_gsi(1)?; MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, @@ -754,10 +754,10 @@ pub(crate) mod tests { #[test] fn test_no_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let device_info = device_manager - .allocate_mmio_resources(&resource_allocator, 0) + .allocate_mmio_resources(&mut resource_allocator, 0) .unwrap(); assert!(device_info.irq.is_none()); } @@ -765,10 +765,10 @@ pub(crate) mod tests { #[test] fn test_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let device_info = device_manager - .allocate_mmio_resources(&resource_allocator, 1) + .allocate_mmio_resources(&mut resource_allocator, 1) .unwrap(); assert_eq!(device_info.irq.unwrap().get(), crate::arch::IRQ_BASE); } @@ -776,12 +776,12 @@ pub(crate) mod tests { #[test] fn test_allocation_failure() { let mut device_manager = MMIODeviceManager::new(); - let 
resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); assert_eq!( format!( "{}", device_manager - .allocate_mmio_resources(&resource_allocator, 2) + .allocate_mmio_resources(&mut resource_allocator, 2) .unwrap_err() ), "Invalid MMIO IRQ configuration.".to_string() diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 9237538fd60..d1b87375b43 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -221,7 +221,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vm: &Vm, ) -> Result<(), AttachDeviceError> { - let vmgenid = VmGenId::new(mem, &vm.common.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &mut vm.resource_allocator())?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 5c09085e84d..1fc5abf52ef 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::fmt::Debug; +use std::ops::DerefMut; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; @@ -127,7 +128,7 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); - let resource_allocator = &vm.common.resource_allocator; + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration @@ -141,16 +142,14 @@ impl PciDevices { VirtioPciDevice::new(id.clone(), mem, device, msix_vectors, pci_device_bdf.into())?; // Allocate bars - let mut mmio32_allocator = resource_allocator - .mmio32_memory - .lock() - .expect("Poisoned lock"); - let mut mmio64_allocator = resource_allocator - .mmio64_memory - .lock() - .expect("Poisoned lock"); - - virtio_device.allocate_bars(&mut 
mmio32_allocator, &mut mmio64_allocator, None)?; + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + + virtio_device.allocate_bars( + &mut resource_allocator.mmio32_memory, + &mut resource_allocator.mmio64_memory, + None, + )?; let virtio_device = Arc::new(Mutex::new(virtio_device)); pci_segment diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 87358181df9..f75a14d4a29 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -220,7 +220,7 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let vmgenid = VmGenId::restore( VMGenIdConstructorArgs { mem: constructor_args.mem, - resource_allocator: &constructor_args.vm.common.resource_allocator, + resource_allocator: &mut constructor_args.vm.resource_allocator(), }, vmgenid_args, )?; diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 5c8d4ecbc51..6d096007193 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -86,7 +86,7 @@ impl VmGenId { /// Allocate memory and a GSI for sending notifications and build the device pub fn new( mem: &GuestMemoryMmap, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, ) -> Result { let gsi = resource_allocator.allocate_gsi(1)?; // The generation ID needs to live in an 8-byte aligned buffer @@ -133,7 +133,7 @@ pub struct VMGenIDState { #[derive(Debug)] pub struct VMGenIdConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a ResourceAllocator, + pub resource_allocator: &'a mut ResourceAllocator, } impl<'a> Persist<'a> for VmGenId { diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index c37763eab3a..7deaa027f7b 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -80,14 +80,13 @@ impl PciSegment { 
PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, )?; - let mem32_allocator = vm.common.resource_allocator.mmio32_memory.clone(); - let mem64_allocator = vm.common.resource_allocator.mmio64_memory.clone(); + let resource_allocator = vm.resource_allocator(); - let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); - let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); + let start_of_mem32_area = resource_allocator.mmio32_memory.base(); + let end_of_mem32_area = resource_allocator.mmio32_memory.end(); - let start_of_mem64_area = mem64_allocator.lock().unwrap().base(); - let end_of_mem64_area = mem64_allocator.lock().unwrap().end(); + let start_of_mem64_area = resource_allocator.mmio64_memory.base(); + let end_of_mem64_area = resource_allocator.mmio64_memory.end(); let segment = PciSegment { id, diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs index 3d8d8016e97..8b0cb4a67c4 100644 --- a/src/vmm/src/vstate/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::Infallible; -use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; @@ -18,36 +17,44 @@ use crate::snapshot::Persist; /// * GSIs for legacy x86_64 devices /// * GSIs for MMIO devicecs /// * Memory allocations in the MMIO address space -#[derive(Debug)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct ResourceAllocator { /// Allocator for device interrupt lines - pub gsi_allocator: Arc>, + pub gsi_allocator: IdAllocator, /// Allocator for memory in the 32-bit MMIO address space - pub mmio32_memory: Arc>, + pub mmio32_memory: AddressAllocator, /// Allocator for memory in the 64-bit MMIO address space - pub mmio64_memory: Arc>, + pub mmio64_memory: AddressAllocator, /// Memory allocator for system data - pub system_memory: Arc>, + pub system_memory: AddressAllocator, +} + +impl Default for ResourceAllocator { + fn default() -> Self { + 
ResourceAllocator::new() + } } impl ResourceAllocator { /// Create a new resource allocator for Firecracker devices - pub fn new() -> Result { - Ok(Self { - gsi_allocator: Arc::new(Mutex::new(IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX)?)), - mmio32_memory: Arc::new(Mutex::new(AddressAllocator::new( + pub fn new() -> Self { + // It is fine for us to unwrap the following since we know we are passing valid ranges for + // all allocators + Self { + gsi_allocator: IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), + mmio32_memory: AddressAllocator::new( arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE, - )?)), - mmio64_memory: Arc::new(Mutex::new(AddressAllocator::new( + ) + .unwrap(), + mmio64_memory: AddressAllocator::new( arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE, - )?)), - system_memory: Arc::new(Mutex::new(AddressAllocator::new( - arch::SYSTEM_MEM_START, - arch::SYSTEM_MEM_SIZE, - )?)), - }) + ) + .unwrap(), + system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE) + .unwrap(), + } } /// Allocate a number of GSIs @@ -55,17 +62,16 @@ impl ResourceAllocator { /// # Arguments /// /// * `gsi_count` - The number of GSIs to allocate - pub fn allocate_gsi(&self, gsi_count: u32) -> Result, vm_allocator::Error> { - let mut gsi_allocator = self.gsi_allocator.lock().expect("Poisoned lock"); + pub fn allocate_gsi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { let mut gsis = Vec::with_capacity(gsi_count as usize); for _ in 0..gsi_count { - match gsi_allocator.allocate_id() { + match self.gsi_allocator.allocate_id() { Ok(gsi) => gsis.push(gsi), Err(err) => { // It is ok to unwrap here, we just allocated the GSI gsis.into_iter().for_each(|gsi| { - gsi_allocator.free_id(gsi).unwrap(); + self.gsi_allocator.free_id(gsi).unwrap(); }); return Err(err); } @@ -85,15 +91,13 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A 
[`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_32bit_mmio_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio32_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } @@ -108,15 +112,13 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_64bit_mmio_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio64_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } @@ -131,78 +133,32 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_system_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .system_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? 
.start()) } } impl<'a> Persist<'a> for ResourceAllocator { - type State = ResourceAllocatorState; + type State = ResourceAllocator; type ConstructorArgs = (); type Error = Infallible; fn save(&self) -> Self::State { - ResourceAllocatorState { - gsi_allocator: self.gsi_allocator.clone(), - mmio32_memory: self.mmio32_memory.clone(), - mmio64_memory: self.mmio64_memory.clone(), - system_memory: self.system_memory.clone(), - } + self.clone() } fn restore( _constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - Ok(ResourceAllocator { - gsi_allocator: state.gsi_allocator.clone(), - mmio32_memory: state.mmio32_memory.clone(), - mmio64_memory: state.mmio64_memory.clone(), - system_memory: state.system_memory.clone(), - }) - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -/// State of a ResourceAllocator -pub struct ResourceAllocatorState { - /// Allocator for device interrupt lines - pub gsi_allocator: Arc>, - /// Allocator for memory in the 32-bit MMIO address space - pub mmio32_memory: Arc>, - /// Allocator for memory in the 64-bit MMIO address space - pub mmio64_memory: Arc>, - /// Memory allocator for system data - pub system_memory: Arc>, -} - -impl Default for ResourceAllocatorState { - fn default() -> Self { - Self { - gsi_allocator: Arc::new(Mutex::new( - IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), - )), - mmio32_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE) - .unwrap(), - )), - mmio64_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE) - .unwrap(), - )), - system_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE).unwrap(), - )), - } + Ok(state.clone()) } } @@ -210,7 +166,7 @@ impl Default for ResourceAllocatorState { mod tests { use vm_allocator::AllocPolicy; - use super::{ResourceAllocator, ResourceAllocatorState}; + use 
super::ResourceAllocator; use crate::arch::{self, IRQ_BASE}; use crate::snapshot::{Persist, Snapshot}; @@ -218,7 +174,7 @@ mod tests { #[test] fn test_allocate_gsi() { - let allocator = ResourceAllocator::new().unwrap(); + let mut allocator = ResourceAllocator::new(); // asking for 0 IRQs should return us an empty vector assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); // We cannot allocate more GSIs than available @@ -239,7 +195,7 @@ mod tests { // But we should be able to ask for 0 GSIs assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); - let allocator = ResourceAllocator::new().unwrap(); + let mut allocator = ResourceAllocator::new(); // We should be able to allocate 1 GSI assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::IRQ_BASE])); // We can't allocate MAX_IRQS any more @@ -258,18 +214,17 @@ mod tests { fn clone_allocator(allocator: &ResourceAllocator) -> ResourceAllocator { let mut buf = vec![0u8; 1024]; Snapshot::serialize(&mut buf.as_mut_slice(), &allocator.save()).unwrap(); - let restored_state: ResourceAllocatorState = - Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let restored_state: ResourceAllocator = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); ResourceAllocator::restore((), &restored_state).unwrap() } #[test] fn test_save_restore() { - let allocator0 = ResourceAllocator::new().unwrap(); + let mut allocator0 = ResourceAllocator::new(); let gsi_0 = allocator0.allocate_gsi(1).unwrap()[0]; assert_eq!(gsi_0, IRQ_BASE); - let allocator1 = clone_allocator(&allocator0); + let mut allocator1 = clone_allocator(&allocator0); let gsi_1 = allocator1.allocate_gsi(1).unwrap()[0]; assert_eq!(gsi_1, IRQ_BASE + 1); let mmio32_mem = allocator1 @@ -285,7 +240,7 @@ mod tests { .unwrap(); assert_eq!(system_mem, arch::SYSTEM_MEM_START); - let allocator2 = clone_allocator(&allocator1); + let mut allocator2 = clone_allocator(&allocator1); allocator2 .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio32_mem)) .unwrap_err(); diff --git 
a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 6bdfad5e37b..787cfa12cce 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -10,7 +10,7 @@ use std::fs::OpenOptions; use std::io::Write; use std::path::Path; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, MutexGuard}; #[cfg(target_arch = "x86_64")] use kvm_bindings::KVM_IRQCHIP_IOAPIC; @@ -246,7 +246,7 @@ pub struct VmCommon { /// Interrupts used by Vm's devices pub interrupts: Mutex>, /// Allocator for VM resources - pub resource_allocator: Arc, + pub resource_allocator: Mutex, /// MMIO bus pub mmio_bus: Arc, } @@ -319,7 +319,7 @@ impl Vm { max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), - resource_allocator: Arc::new(ResourceAllocator::new()?), + resource_allocator: Mutex::new(ResourceAllocator::new()), mmio_bus: Arc::new(vm_device::Bus::new()), }) } @@ -405,6 +405,14 @@ impl Vm { &self.common.guest_memory } + /// Gets a mutable reference to this [`Vm`]'s [`ResourceAllocator`] object + pub fn resource_allocator(&self) -> MutexGuard { + self.common + .resource_allocator + .lock() + .expect("Poisoned lock") + } + /// Resets the KVM dirty bitmap for each of the guest's memory regions. pub fn reset_dirty_bitmap(&self) { self.guest_memory() @@ -578,8 +586,7 @@ impl Vm { debug!("Creating new MSI group with {count} vectors"); let mut irq_routes = HashMap::with_capacity(count as usize); for (gsi, i) in vm - .common - .resource_allocator + .resource_allocator() .allocate_gsi(count as u32)? .iter() .zip(0u32..) 
@@ -628,6 +635,8 @@ pub(crate) mod tests { use vm_memory::mmap::MmapRegionBuilder; use super::*; + #[cfg(target_arch = "x86_64")] + use crate::snapshot::Snapshot; use crate::test_utils::single_region_mem_raw; use crate::utils::mib_to_bytes; use crate::vstate::kvm::Kvm; @@ -966,4 +975,43 @@ pub(crate) mod tests { assert!(!new_vector.enabled.load(Ordering::Acquire)); } } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_restore_state_resource_allocator() { + use vm_allocator::AllocPolicy; + + let mut snapshot_data = vec![0u8; 10000]; + let (_, mut vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + // Allocate a GSI and some memory and make sure they are still allocated after restore + let (gsi, range) = { + let mut resource_allocator = vm.resource_allocator(); + + let gsi = resource_allocator.allocate_gsi(1).unwrap()[0]; + let range = resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) + .unwrap(); + (gsi, range) + }; + + let state = vm.save_state().unwrap(); + Snapshot::serialize(&mut snapshot_data.as_mut_slice(), &state).unwrap(); + + let restored_state: VmState = Snapshot::deserialize(&mut snapshot_data.as_slice()).unwrap(); + vm.restore_state(&restored_state).unwrap(); + + let mut resource_allocator = vm.resource_allocator(); + let gsi_new = resource_allocator.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi + 1, gsi_new); + + resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::ExactMatch(range)) + .unwrap_err(); + let range_new = resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(range + 1024, range_new); + } } From b500902b5b991fa4c81bfc6ba960ee7250198f0d Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 2 Jul 2025 16:17:48 +0200 Subject: [PATCH 27/27] fix(vsock): pass correct index when triggering interrupts We were confusing queue indexes with event indexes, when passing the index of the queue that needed to be 
triggered after handling events. Fix the logic to pass the correct index. This refactors a bit the code to signal the queues in each event handler method. With MMIO we don't need to signal each queue independently (one signal will cause the guest to scan all queues). With PCI though, we are using MSI-X, so we need to signal each queue independently. Also, change vsock functional integration tests to also run for PCI-enabled microVMs. Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/vsock/device.rs | 2 +- .../src/devices/virtio/vsock/event_handler.rs | 70 ++++++++++--------- .../functional/test_vsock.py | 20 +++--- 3 files changed, 51 insertions(+), 41 deletions(-) diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index e3b003f498e..c4897c82f1e 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -380,7 +380,7 @@ where // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
if self.is_activated() { info!("kick vsock {}.", self.id()); - self.signal_used_queue(0).unwrap(); + self.signal_used_queue(EVQ_INDEX).unwrap(); } } } diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index 47eb2640837..abe408160f1 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -46,75 +46,80 @@ where const PROCESS_EVQ: u32 = 3; const PROCESS_NOTIFY_BACKEND: u32 = 4; - pub fn handle_rxq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_rxq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: rxq unexpected event {:?}", evset); METRICS.rx_queue_event_fails.inc(); - return false; + return; } - let mut raise_irq = false; if let Err(err) = self.queue_events[RXQ_INDEX].read() { error!("Failed to get vsock rx queue event: {:?}", err); METRICS.rx_queue_event_fails.inc(); } else if self.backend.has_pending_rx() { - raise_irq |= self.process_rx(); + if self.process_rx() { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); + } METRICS.rx_queue_event_count.inc(); } - raise_irq } - pub fn handle_txq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_txq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: txq unexpected event {:?}", evset); METRICS.tx_queue_event_fails.inc(); - return false; + return; } - let mut raise_irq = false; if let Err(err) = self.queue_events[TXQ_INDEX].read() { error!("Failed to get vsock tx queue event: {:?}", err); METRICS.tx_queue_event_fails.inc(); } else { - raise_irq |= self.process_tx(); + if self.process_tx() { + self.signal_used_queue(TXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or TX queue"); + } METRICS.tx_queue_event_count.inc(); // The backend may have queued up responses to the packets we sent during // TX queue processing. 
If that happened, we need to fetch those responses // and place them into RX buffers. - if self.backend.has_pending_rx() { - raise_irq |= self.process_rx(); + if self.backend.has_pending_rx() && self.process_rx() { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); } } - raise_irq } - pub fn handle_evq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_evq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: evq unexpected event {:?}", evset); METRICS.ev_queue_event_fails.inc(); - return false; + return; } if let Err(err) = self.queue_events[EVQ_INDEX].read() { error!("Failed to consume vsock evq event: {:?}", err); METRICS.ev_queue_event_fails.inc(); } - false } /// Notify backend of new events. - pub fn notify_backend(&mut self, evset: EventSet) -> bool { + pub fn notify_backend(&mut self, evset: EventSet) { self.backend.notify(evset); // After the backend has been kicked, it might've freed up some resources, so we // can attempt to send it more data to process. // In particular, if `self.backend.send_pkt()` halted the TX queue processing (by // returning an error) at some point in the past, now is the time to try walking the // TX queue again. 
- let mut raise_irq = self.process_tx(); - if self.backend.has_pending_rx() { - raise_irq |= self.process_rx(); + if self.process_tx() { + self.signal_used_queue(TXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or TX queue"); + } + if self.backend.has_pending_rx() && self.process_rx() { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); } - raise_irq } fn register_runtime_events(&self, ops: &mut EventOps) { @@ -182,19 +187,14 @@ where let evset = event.event_set(); if self.is_activated() { - let mut raise_irq = false; match source { Self::PROCESS_ACTIVATE => self.handle_activate_event(ops), - Self::PROCESS_RXQ => raise_irq = self.handle_rxq_event(evset), - Self::PROCESS_TXQ => raise_irq = self.handle_txq_event(evset), - Self::PROCESS_EVQ => raise_irq = self.handle_evq_event(evset), - Self::PROCESS_NOTIFY_BACKEND => raise_irq = self.notify_backend(evset), + Self::PROCESS_RXQ => self.handle_rxq_event(evset), + Self::PROCESS_TXQ => self.handle_txq_event(evset), + Self::PROCESS_EVQ => self.handle_evq_event(evset), + Self::PROCESS_NOTIFY_BACKEND => self.notify_backend(evset), _ => warn!("Unexpected vsock event received: {:?}", source), }; - if raise_irq { - self.signal_used_queue(source as usize) - .expect("vsock: Could not trigger device interrupt"); - } } else { warn!( "Vsock: The device is not yet activated. 
Spurious event received: {:?}", @@ -302,7 +302,9 @@ mod tests { let mut ctx = test_ctx.create_event_handler_context(); ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); - assert!(!ctx.device.handle_txq_event(EventSet::IN)); + let metric_before = METRICS.tx_queue_event_fails.count(); + ctx.device.handle_txq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.tx_queue_event_fails.count()); } } @@ -363,7 +365,9 @@ mod tests { let mut ctx = test_ctx.create_event_handler_context(); ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); - assert!(!ctx.device.handle_rxq_event(EventSet::IN)); + let metric_before = METRICS.rx_queue_event_fails.count(); + ctx.device.handle_rxq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.rx_queue_event_fails.count()); } } @@ -374,7 +378,9 @@ mod tests { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); ctx.device.backend.set_pending_rx(false); - assert!(!ctx.device.handle_evq_event(EventSet::IN)); + let metric_before = METRICS.ev_queue_event_fails.count(); + ctx.device.handle_evq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.ev_queue_event_fails.count()); } } diff --git a/tests/integration_tests/functional/test_vsock.py b/tests/integration_tests/functional/test_vsock.py index dfa02510b37..5b6221c32a9 100644 --- a/tests/integration_tests/functional/test_vsock.py +++ b/tests/integration_tests/functional/test_vsock.py @@ -37,7 +37,7 @@ TEST_WORKER_COUNT = 10 -def test_vsock(uvm_plain_any, bin_vsock_path, test_fc_session_root_path): +def test_vsock(uvm_plain_any, pci_enabled, bin_vsock_path, test_fc_session_root_path): """ Test guest and host vsock initiated connections. 
@@ -45,7 +45,7 @@ def test_vsock(uvm_plain_any, bin_vsock_path, test_fc_session_root_path): """ vm = uvm_plain_any - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config() vm.add_net_iface() @@ -102,12 +102,12 @@ def negative_test_host_connections(vm, blob_path, blob_hash): validate_fc_metrics(metrics) -def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path): +def test_vsock_epipe(uvm_plain, pci_enabled, bin_vsock_path, test_fc_session_root_path): """ Vsock negative test to validate SIGPIPE/EPIPE handling. """ vm = uvm_plain - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config() vm.add_net_iface() vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") @@ -129,7 +129,7 @@ def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path): def test_vsock_transport_reset_h2g( - uvm_nano, microvm_factory, bin_vsock_path, test_fc_session_root_path + uvm_plain, pci_enabled, microvm_factory, bin_vsock_path, test_fc_session_root_path ): """ Vsock transport reset test. @@ -146,7 +146,9 @@ def test_vsock_transport_reset_h2g( 6. Close VM -> Load VM from Snapshot -> check that vsock device is still working. """ - test_vm = uvm_nano + test_vm = uvm_plain + test_vm.spawn(pci=pci_enabled) + test_vm.basic_config(vcpu_count=2, mem_size_mib=256) test_vm.add_net_iface() test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") test_vm.start() @@ -213,11 +215,13 @@ def test_vsock_transport_reset_h2g( validate_fc_metrics(metrics) -def test_vsock_transport_reset_g2h(uvm_nano, microvm_factory): +def test_vsock_transport_reset_g2h(uvm_plain, pci_enabled, microvm_factory): """ Vsock transport reset test. """ - test_vm = uvm_nano + test_vm = uvm_plain + test_vm.spawn(pci=pci_enabled) + test_vm.basic_config(vcpu_count=2, mem_size_mib=256) test_vm.add_net_iface() test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") test_vm.start()