Skip to content

nvme: Pass controller instance ID to NvmeDriver for better tracing #1661

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
Draft
Original file line number Diff line number Diff line change
Expand Up @@ -993,6 +993,7 @@ async fn make_disk_type_from_physical_device(
// We can't validate yet that this namespace actually exists. That will
// be checked later.
return Ok(Resource::new(NvmeDiskConfig {
name: controller_instance_id.to_string(),
pci_id,
nsid: sub_device_path,
}));
Expand Down
2 changes: 1 addition & 1 deletion openhcl/underhill_core/src/emuplat/netvsp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ async fn try_create_mana_device(
max_sub_channels: u16,
dma_client: Arc<dyn DmaClient>,
) -> anyhow::Result<ManaDevice<VfioDevice>> {
let device = VfioDevice::new(driver_source, pci_id, dma_client)
let device = VfioDevice::new(driver_source, pci_id, None, dma_client)
.await
.context("failed to open device")?;

Expand Down
100 changes: 79 additions & 21 deletions openhcl/underhill_core/src/nvme_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,19 @@ use std::collections::HashMap;
use std::collections::hash_map;
use thiserror::Error;
use tracing::Instrument;
use user_driver::vfio::PciId;

/// Strongly typed wrapper for NVMe device name (formerly debug_id/controller_instance_id)
#[derive(Debug, Clone, PartialEq, Eq, MeshPayload)]
pub struct NvmeDeviceName(pub String);

/// Parameters for getting an NVMe namespace
#[derive(Debug, Clone, MeshPayload)]
pub struct GetNamespaceParams {
pub name: NvmeDeviceName,
pub pci_id: PciId,
pub nsid: u32,
}
use user_driver::vfio::VfioDevice;
use vm_resource::AsyncResolveResource;
use vm_resource::ResourceId;
Expand Down Expand Up @@ -175,7 +188,7 @@ impl NvmeManager {
enum Request {
Inspect(inspect::Deferred),
ForceLoadDriver(inspect::DeferredUpdate),
GetNamespace(Rpc<(String, u32), Result<nvme_driver::Namespace, NamespaceError>>),
GetNamespace(Rpc<GetNamespaceParams, Result<nvme_driver::Namespace, NamespaceError>>),
Save(Rpc<(), Result<NvmeManagerSavedState, anyhow::Error>>),
Shutdown {
span: tracing::Span,
Expand All @@ -191,13 +204,24 @@ pub struct NvmeManagerClient {
impl NvmeManagerClient {
pub async fn get_namespace(
&self,
name: String,
pci_id: String,
nsid: u32,
) -> anyhow::Result<nvme_driver::Namespace> {
let params = GetNamespaceParams {
name: NvmeDeviceName(name),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, the specific type names look weird. Since you have defined a GetNamespaceParams struct, this (and the PciId) can just be String.

In addition, let's change name to debug_host_id wherever you are making changes in NvmeManagerClient

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed GetNamespaceParams to use String types instead of NvmeDeviceName and PciId wrappers, and renamed the parameter from name to debug_host_id throughout NvmeManagerClient as requested. (commit 38bc76e)

pci_id: PciId(pci_id),
nsid,
};
Ok(self
.sender
.call(Request::GetNamespace, (pci_id.clone(), nsid))
.instrument(tracing::info_span!("nvme_get_namespace", pci_id, nsid))
.call(Request::GetNamespace, params.clone())
.instrument(tracing::info_span!(
"nvme_get_namespace",
name = params.name.0,
pci_id = params.pci_id.0,
nsid
))
.await
.context("nvme manager is shut down")??)
}
Expand Down Expand Up @@ -235,7 +259,10 @@ impl NvmeManagerWorker {
match req {
Request::Inspect(deferred) => deferred.inspect(&self),
Request::ForceLoadDriver(update) => {
match self.get_driver(update.new_value().to_owned()).await {
match self
.get_driver("force-load".to_string(), update.new_value().to_owned())
.await
{
Ok(_) => {
let pci_id = update.new_value().to_string();
update.succeed(pci_id);
Expand All @@ -246,9 +273,12 @@ impl NvmeManagerWorker {
}
}
Request::GetNamespace(rpc) => {
rpc.handle(async |(pci_id, nsid)| {
self.get_namespace(pci_id.clone(), nsid)
.map_err(|source| NamespaceError { pci_id, source })
rpc.handle(async |params| {
self.get_namespace(params.name.0, params.pci_id.0.clone(), params.nsid)
.map_err(|source| NamespaceError {
pci_id: params.pci_id.0,
source,
})
.await
})
.await
Expand Down Expand Up @@ -283,9 +313,12 @@ impl NvmeManagerWorker {
if !nvme_keepalive || !self.save_restore_supported {
async {
join_all(self.devices.drain().map(|(pci_id, driver)| {
driver
.shutdown()
.instrument(tracing::info_span!("shutdown_nvme_driver", pci_id))
let name = driver.name();
driver.shutdown().instrument(tracing::info_span!(
"shutdown_nvme_driver",
pci_id,
name
))
}))
.await
}
Expand All @@ -296,6 +329,7 @@ impl NvmeManagerWorker {

async fn get_driver(
&mut self,
name: String,
pci_id: String,
) -> Result<&mut nvme_driver::NvmeDriver<VfioDevice>, InnerError> {
let driver = match self.devices.entry(pci_id.to_owned()) {
Expand All @@ -315,10 +349,15 @@ impl NvmeManagerWorker {
})
.map_err(InnerError::DmaClient)?;

let device = VfioDevice::new(&self.driver_source, entry.key(), dma_client)
.instrument(tracing::info_span!("vfio_device_open", pci_id))
.await
.map_err(InnerError::Vfio)?;
let device = VfioDevice::new(
&self.driver_source,
entry.key(),
Some(name.clone()),
dma_client,
)
.instrument(tracing::info_span!("vfio_device_open", pci_id))
.await
.map_err(InnerError::Vfio)?;

// TODO: For now, any isolation means use bounce buffering. This
// needs to change when we have nvme devices that support DMA to
Expand All @@ -328,6 +367,7 @@ impl NvmeManagerWorker {
self.vp_count,
device,
self.is_isolated,
name,
)
.instrument(tracing::info_span!(
"nvme_driver_init",
Expand All @@ -344,10 +384,11 @@ impl NvmeManagerWorker {

async fn get_namespace(
&mut self,
name: String,
pci_id: String,
nsid: u32,
) -> Result<nvme_driver::Namespace, InnerError> {
let driver = self.get_driver(pci_id.to_owned()).await?;
let driver = self.get_driver(name, pci_id.to_owned()).await?;
driver
.namespace(nsid)
.await
Expand Down Expand Up @@ -393,10 +434,15 @@ impl NvmeManagerWorker {
// This code can wait on each VFIO device until it is arrived.
// A potential optimization would be to delay VFIO operation
// until it is ready, but a redesign of VfioDevice is needed.
let vfio_device =
VfioDevice::restore(&self.driver_source, &disk.pci_id.clone(), true, dma_client)
.instrument(tracing::info_span!("vfio_device_restore", pci_id))
.await?;
let vfio_device = VfioDevice::restore(
&self.driver_source,
&disk.pci_id.clone(),
Some(format!("restored-{}", pci_id)),
true,
dma_client,
)
.instrument(tracing::info_span!("vfio_device_restore", pci_id))
.await?;

// TODO: For now, any isolation means use bounce buffering. This
// needs to change when we have nvme devices that support DMA to
Expand All @@ -407,6 +453,7 @@ impl NvmeManagerWorker {
vfio_device,
&disk.driver_state,
self.is_isolated,
format!("restored-{}", disk.pci_id), // Use PCI ID as name for restored drivers
)
.instrument(tracing::info_span!("nvme_driver_restore"))
.await?;
Expand Down Expand Up @@ -440,20 +487,31 @@ impl AsyncResolveResource<DiskHandleKind, NvmeDiskConfig> for NvmeDiskResolver {
) -> Result<Self::Output, Self::Error> {
let namespace = self
.manager
.get_namespace(rsrc.pci_id, rsrc.nsid)
.get_namespace(rsrc.debug_id, rsrc.pci_id, rsrc.nsid)
.await
.context("could not open nvme namespace")?;

Ok(ResolvedDisk::new(disk_nvme::NvmeDisk::new(namespace)).context("invalid disk")?)
}
}

#[derive(MeshPayload, Default)]
#[derive(MeshPayload)]
pub struct NvmeDiskConfig {
pub name: String,
pub pci_id: String,
pub nsid: u32,
}

impl Default for NvmeDiskConfig {
fn default() -> Self {
Self {
name: "force-load".to_string(),
pci_id: String::new(),
nsid: 0,
}
}
}

impl ResourceId<DiskHandleKind> for NvmeDiskConfig {
const ID: &'static str = "nvme";
}
Expand Down
Loading