Skip to content

nvme: Pass controller instance ID to NvmeDriver for better tracing #1661

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
Draft
Original file line number Diff line number Diff line change
Expand Up @@ -993,6 +993,7 @@ async fn make_disk_type_from_physical_device(
// We can't validate yet that this namespace actually exists. That will
// be checked later.
return Ok(Resource::new(NvmeDiskConfig {
controller_instance_id: Some(controller_instance_id.to_string()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename controller_instance_id to debug_id

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed controller_instance_id to debug_id throughout the codebase in commit e76bedf.

pci_id,
nsid: sub_device_path,
}));
Expand Down
44 changes: 33 additions & 11 deletions openhcl/underhill_core/src/nvme_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,9 @@ impl NvmeManager {
enum Request {
Inspect(inspect::Deferred),
ForceLoadDriver(inspect::DeferredUpdate),
GetNamespace(Rpc<(String, u32), Result<nvme_driver::Namespace, NamespaceError>>),
GetNamespace(
Rpc<(Option<String>, String, u32), Result<nvme_driver::Namespace, NamespaceError>>,
),
Save(Rpc<(), Result<NvmeManagerSavedState, anyhow::Error>>),
Shutdown {
span: tracing::Span,
Expand All @@ -191,13 +193,22 @@ pub struct NvmeManagerClient {
impl NvmeManagerClient {
pub async fn get_namespace(
&self,
controller_instance_id: Option<String>,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make this mandatory (a String rather than Option<String>)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made the parameter mandatory by changing from Option<String> to String throughout the codebase in commit e76bedf.

pci_id: String,
nsid: u32,
) -> anyhow::Result<nvme_driver::Namespace> {
Ok(self
.sender
.call(Request::GetNamespace, (pci_id.clone(), nsid))
.instrument(tracing::info_span!("nvme_get_namespace", pci_id, nsid))
.call(
Request::GetNamespace,
(controller_instance_id.clone(), pci_id.clone(), nsid),
)
.instrument(tracing::info_span!(
"nvme_get_namespace",
controller_instance_id = controller_instance_id.as_ref().map(|s| s.as_str()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the .as_ref() necessary here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The .as_ref() is no longer necessary since the parameter is now a String instead of Option<String>. Simplified the tracing span code in commit e76bedf.

pci_id,
nsid
))
.await
.context("nvme manager is shut down")??)
}
Expand Down Expand Up @@ -235,7 +246,7 @@ impl NvmeManagerWorker {
match req {
Request::Inspect(deferred) => deferred.inspect(&self),
Request::ForceLoadDriver(update) => {
match self.get_driver(update.new_value().to_owned()).await {
match self.get_driver(None, update.new_value().to_owned()).await {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of "None", create a dummy string: force-load

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Replaced None with "force-load" dummy string in the ForceLoadDriver case in commit e76bedf.

Ok(_) => {
let pci_id = update.new_value().to_string();
update.succeed(pci_id);
Expand All @@ -246,8 +257,8 @@ impl NvmeManagerWorker {
}
}
Request::GetNamespace(rpc) => {
rpc.handle(async |(pci_id, nsid)| {
self.get_namespace(pci_id.clone(), nsid)
rpc.handle(async |(controller_instance_id, pci_id, nsid)| {
self.get_namespace(controller_instance_id, pci_id.clone(), nsid)
.map_err(|source| NamespaceError { pci_id, source })
.await
})
Expand Down Expand Up @@ -283,9 +294,13 @@ impl NvmeManagerWorker {
if !nvme_keepalive || !self.save_restore_supported {
async {
join_all(self.devices.drain().map(|(pci_id, driver)| {
driver
.shutdown()
.instrument(tracing::info_span!("shutdown_nvme_driver", pci_id))
let controller_instance_id = driver.controller_instance_id();
driver.shutdown().instrument(tracing::info_span!(
"shutdown_nvme_driver",
pci_id,
controller_instance_id =
controller_instance_id.as_ref().map(|s| s.as_str())
))
}))
.await
}
Expand All @@ -296,6 +311,7 @@ impl NvmeManagerWorker {

async fn get_driver(
&mut self,
controller_instance_id: Option<String>,
pci_id: String,
) -> Result<&mut nvme_driver::NvmeDriver<VfioDevice>, InnerError> {
let driver = match self.devices.entry(pci_id.to_owned()) {
Expand Down Expand Up @@ -328,6 +344,7 @@ impl NvmeManagerWorker {
self.vp_count,
device,
self.is_isolated,
controller_instance_id,
)
.instrument(tracing::info_span!(
"nvme_driver_init",
Expand All @@ -344,10 +361,13 @@ impl NvmeManagerWorker {

async fn get_namespace(
&mut self,
controller_instance_id: Option<String>,
pci_id: String,
nsid: u32,
) -> Result<nvme_driver::Namespace, InnerError> {
let driver = self.get_driver(pci_id.to_owned()).await?;
let driver = self
.get_driver(controller_instance_id, pci_id.to_owned())
.await?;
driver
.namespace(nsid)
.await
Expand Down Expand Up @@ -407,6 +427,7 @@ impl NvmeManagerWorker {
vfio_device,
&disk.driver_state,
self.is_isolated,
None, // controller_instance_id is not persisted in saved state
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is unfortunate. Is there a way to re-hook-up the name (formerly known as controller_instance_id and debug_id) after a restore, without putting it in the NvmeManagerSavedState?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For restored drivers, I now generate the name as "restored-{pci_id}" using the available PCI ID. This avoids storing the name in NvmeManagerSavedState while still providing a meaningful identifier for tracing. (commit 02fbdbc)

)
.instrument(tracing::info_span!("nvme_driver_restore"))
.await?;
Expand Down Expand Up @@ -440,7 +461,7 @@ impl AsyncResolveResource<DiskHandleKind, NvmeDiskConfig> for NvmeDiskResolver {
) -> Result<Self::Output, Self::Error> {
let namespace = self
.manager
.get_namespace(rsrc.pci_id, rsrc.nsid)
.get_namespace(rsrc.controller_instance_id, rsrc.pci_id, rsrc.nsid)
.await
.context("could not open nvme namespace")?;

Expand All @@ -450,6 +471,7 @@ impl AsyncResolveResource<DiskHandleKind, NvmeDiskConfig> for NvmeDiskResolver {

#[derive(MeshPayload, Default)]
pub struct NvmeDiskConfig {
pub controller_instance_id: Option<String>,
pub pci_id: String,
pub nsid: u32,
}
Expand Down
23 changes: 20 additions & 3 deletions vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ pub struct NvmeDriver<T: DeviceBacking> {
#[inspect(flatten)]
task: Option<TaskControl<DriverWorkerTask<T>, WorkerState>>,
device_id: String,
controller_instance_id: Option<String>,
identify: Option<Arc<spec::IdentifyController>>,
#[inspect(skip)]
driver: VmTaskDriver,
Expand Down Expand Up @@ -179,11 +180,18 @@ impl<T: DeviceBacking> NvmeDriver<T> {
cpu_count: u32,
device: T,
bounce_buffer: bool,
controller_instance_id: Option<String>,
) -> anyhow::Result<Self> {
let pci_id = device.id().to_owned();
let mut this = Self::new_disabled(driver_source, cpu_count, device, bounce_buffer)
.instrument(tracing::info_span!("nvme_new_disabled", pci_id))
.await?;
let mut this = Self::new_disabled(
driver_source,
cpu_count,
device,
bounce_buffer,
controller_instance_id,
)
.instrument(tracing::info_span!("nvme_new_disabled", pci_id))
.await?;
match this
.enable(cpu_count as u16)
.instrument(tracing::info_span!("nvme_enable", pci_id))
Expand All @@ -208,6 +216,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
cpu_count: u32,
mut device: T,
bounce_buffer: bool,
controller_instance_id: Option<String>,
) -> anyhow::Result<Self> {
let driver = driver_source.simple();
let bar0 = Bar0(
Expand Down Expand Up @@ -248,6 +257,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {

Ok(Self {
device_id: device.id().to_owned(),
controller_instance_id,
task: Some(TaskControl::new(DriverWorkerTask {
device,
driver: driver.clone(),
Expand Down Expand Up @@ -568,6 +578,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
mut device: T,
saved_state: &NvmeDriverSavedState,
bounce_buffer: bool,
controller_instance_id: Option<String>,
) -> anyhow::Result<Self> {
let driver = driver_source.simple();
let bar0_mapping = device
Expand All @@ -594,6 +605,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {

let mut this = Self {
device_id: device.id().to_owned(),
controller_instance_id,
task: Some(TaskControl::new(DriverWorkerTask {
device,
driver: driver.clone(),
Expand Down Expand Up @@ -739,6 +751,11 @@ impl<T: DeviceBacking> NvmeDriver<T> {
pub fn update_servicing_flags(&mut self, nvme_keepalive: bool) {
self.nvme_keepalive = nvme_keepalive;
}

/// Get the controller instance ID, if set.
pub fn controller_instance_id(&self) -> Option<String> {
self.controller_instance_id.clone()
}
}

async fn handle_asynchronous_events(
Expand Down
10 changes: 5 additions & 5 deletions vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ async fn test_nvme_ioqueue_max_mqes(driver: DefaultDriver) {
let cap: Cap = Cap::new().with_mqes_z(max_u16);
device.set_mock_response_u64(Some((0, cap.into())));

let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false).await;
let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false, None).await;
assert!(driver.is_ok());
}

Expand Down Expand Up @@ -113,7 +113,7 @@ async fn test_nvme_ioqueue_invalid_mqes(driver: DefaultDriver) {
// Setup mock response at offset 0
let cap: Cap = Cap::new().with_mqes_z(0);
device.set_mock_response_u64(Some((0, cap.into())));
let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false).await;
let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false, None).await;

assert!(driver.is_err());
}
Expand Down Expand Up @@ -150,7 +150,7 @@ async fn test_nvme_driver(driver: DefaultDriver, allow_dma: bool) {
.await
.unwrap();
let device = NvmeTestEmulatedDevice::new(nvme, msi_set, dma_client.clone());
let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false)
let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false, None)
.await
.unwrap();
let namespace = driver.namespace(1).await.unwrap();
Expand Down Expand Up @@ -266,7 +266,7 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
.unwrap();

let device = NvmeTestEmulatedDevice::new(nvme_ctrl, msi_x, dma_client.clone());
let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false)
let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false, None)
.await
.unwrap();
let _ns1 = nvme_driver.namespace(1).await.unwrap();
Expand Down Expand Up @@ -304,7 +304,7 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {

let _new_device = NvmeTestEmulatedDevice::new(new_nvme_ctrl, new_msi_x, dma_client.clone());
// TODO: Memory restore is disabled for emulated DMA, uncomment once fixed.
// let _new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state)
// let _new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state, false, None)
// .await
// .unwrap();
}
Expand Down