nvme: Print controller GUID in shutdown_nvme_driver span #1619

Open · wants to merge 3 commits into base: main

Changes from 2 commits
@@ -993,6 +993,7 @@ async fn make_disk_type_from_physical_device(
     // We can't validate yet that this namespace actually exists. That will
     // be checked later.
     return Ok(Resource::new(NvmeDiskConfig {
+        controller_instance_id: controller_instance_id.to_string(),
         pci_id,
         nsid: sub_device_path,
     }));
openhcl/underhill_core/src/nvme_manager.rs: 46 changes (34 additions, 12 deletions)
@@ -175,7 +175,7 @@ impl NvmeManager {
 enum Request {
     Inspect(inspect::Deferred),
     ForceLoadDriver(inspect::DeferredUpdate),
-    GetNamespace(Rpc<(String, u32), Result<nvme_driver::Namespace, NamespaceError>>),
+    GetNamespace(Rpc<(String, String, u32), Result<nvme_driver::Namespace, NamespaceError>>),
     Save(Rpc<(), Result<NvmeManagerSavedState, anyhow::Error>>),
     Shutdown {
         span: tracing::Span,
@@ -191,13 +191,22 @@ pub struct NvmeManagerClient {
 impl NvmeManagerClient {
     pub async fn get_namespace(
         &self,
+        controller_instance_id: String,
         pci_id: String,
         nsid: u32,
     ) -> anyhow::Result<nvme_driver::Namespace> {
         Ok(self
             .sender
-            .call(Request::GetNamespace, (pci_id.clone(), nsid))
-            .instrument(tracing::info_span!("nvme_get_namespace", pci_id, nsid))
+            .call(
+                Request::GetNamespace,
+                (controller_instance_id.clone(), pci_id.clone(), nsid),
+            )
+            .instrument(tracing::info_span!(
+                "nvme_get_namespace",
+                controller_instance_id,
+                pci_id,
+                nsid
+            ))
             .await
             .context("nvme manager is shut down")??)
     }
@@ -235,7 +244,7 @@ impl NvmeManagerWorker {
         match req {
             Request::Inspect(deferred) => deferred.inspect(&self),
             Request::ForceLoadDriver(update) => {
-                match self.get_driver(update.new_value().to_owned()).await {
+                match self.get_driver(None, update.new_value().to_owned()).await {
                     Ok(_) => {
                         let pci_id = update.new_value().to_string();
                         update.succeed(pci_id);
@@ -246,8 +255,8 @@
                     }
                 }
                 Request::GetNamespace(rpc) => {
-                    rpc.handle(async |(pci_id, nsid)| {
-                        self.get_namespace(pci_id.clone(), nsid)
+                    rpc.handle(async |(controller_instance_id, pci_id, nsid)| {
+                        self.get_namespace(controller_instance_id, pci_id.clone(), nsid)
                             .map_err(|source| NamespaceError { pci_id, source })
                             .await
                     })
@@ -283,9 +292,12 @@ impl NvmeManagerWorker {
         if !nvme_keepalive || !self.save_restore_supported {
             async {
                 join_all(self.devices.drain().map(|(pci_id, driver)| {
-                    driver
-                        .shutdown()
-                        .instrument(tracing::info_span!("shutdown_nvme_driver", pci_id))
+                    let controller_instance_id = driver.controller_instance_id();
+                    driver.shutdown().instrument(tracing::info_span!(
+                        "shutdown_nvme_driver",
+                        pci_id,
+                        controller_instance_id
+                    ))
                 }))
                 .await
             }
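
The hunk above is the core of this PR: the controller instance ID is recorded as a field on the `shutdown_nvme_driver` span, so it is attached to every event emitted while the shutdown future runs. A standalone sketch of the same pattern (`shutdown_stub` and the IDs are made up; assumes the tokio, tracing, and tracing-subscriber crates):

```rust
use tracing::Instrument;

// Hypothetical stand-in for NvmeDriver::shutdown.
async fn shutdown_stub() {
    // Any event emitted inside the instrumented future inherits the
    // span's fields in the output.
    tracing::info!("controller shut down");
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt().init();

    let pci_id = "0000:00:1f.0".to_string();
    // None when the driver was force-loaded rather than created for a
    // specific controller.
    let controller_instance_id: Option<String> =
        Some("00000000-0000-0000-0000-000000000001".to_string());

    shutdown_stub()
        .instrument(tracing::info_span!(
            "shutdown_nvme_driver",
            pci_id = %pci_id,
            // `?` records the Option via Debug, so `None` still shows up.
            controller_instance_id = ?controller_instance_id,
        ))
        .await;
}
```

With the fmt subscriber this prints something like `shutdown_nvme_driver{pci_id=0000:00:1f.0 controller_instance_id=Some("...")}: controller shut down`.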
@@ -296,6 +308,7 @@

     async fn get_driver(
         &mut self,
+        controller_instance_id: Option<String>,
         pci_id: String,
     ) -> Result<&mut nvme_driver::NvmeDriver<VfioDevice>, InnerError> {
         let driver = match self.devices.entry(pci_id.to_owned()) {
@@ -323,7 +336,7 @@
                 // TODO: For now, any isolation means use bounce buffering. This
                 // needs to change when we have nvme devices that support DMA to
                 // confidential memory.
-                let driver = nvme_driver::NvmeDriver::new(
+                let mut driver = nvme_driver::NvmeDriver::new(
                     &self.driver_source,
                     self.vp_count,
                     device,
@@ -336,6 +349,11 @@
                 .await
                 .map_err(InnerError::DeviceInitFailed)?;

+                // Pass through controller instance ID if specified.
+                if let Some(controller_instance_id) = controller_instance_id {
+                    driver.set_controller_instance_id(controller_instance_id);
+                }
+
                 entry.insert(driver)
             }
         };
@@ -344,10 +362,13 @@

     async fn get_namespace(
         &mut self,
+        controller_instance_id: String,
         pci_id: String,
         nsid: u32,
     ) -> Result<nvme_driver::Namespace, InnerError> {
-        let driver = self.get_driver(pci_id.to_owned()).await?;
+        let driver = self
+            .get_driver(Some(controller_instance_id), pci_id.to_owned())
+            .await?;
         driver
             .namespace(nsid)
             .await
@@ -440,7 +461,7 @@ impl AsyncResolveResource<DiskHandleKind, NvmeDiskConfig> for NvmeDiskResolver {
     ) -> Result<Self::Output, Self::Error> {
         let namespace = self
             .manager
-            .get_namespace(rsrc.pci_id, rsrc.nsid)
+            .get_namespace(rsrc.controller_instance_id, rsrc.pci_id, rsrc.nsid)
             .await
             .context("could not open nvme namespace")?;

@@ -450,6 +471,7 @@

 #[derive(MeshPayload, Default)]
 pub struct NvmeDiskConfig {
+    pub controller_instance_id: String,
     pub pci_id: String,
     pub nsid: u32,
 }
vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs: 17 changes (17 additions, 0 deletions)
@@ -56,6 +56,7 @@ pub struct NvmeDriver<T: DeviceBacking> {
     #[inspect(flatten)]
     task: Option<TaskControl<DriverWorkerTask<T>, WorkerState>>,
     device_id: String,
+    controller_instance_id: Option<String>,
     identify: Option<Arc<spec::IdentifyController>>,
     #[inspect(skip)]
     driver: VmTaskDriver,
@@ -248,6 +249,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {

         Ok(Self {
             device_id: device.id().to_owned(),
+            controller_instance_id: None,
             task: Some(TaskControl::new(DriverWorkerTask {
                 device,
                 driver: driver.clone(),
@@ -555,6 +557,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
             // TODO: See the description above, save the vector once resolved.
             namespaces: vec![],
             worker_data: s,
+            controller_instance_id: self.controller_instance_id.clone(),
         })
     }
     Err(e) => Err(e),
@@ -594,6 +597,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {

         let mut this = Self {
             device_id: device.id().to_owned(),
+            controller_instance_id: saved_state.controller_instance_id.clone(),
             task: Some(TaskControl::new(DriverWorkerTask {
                 device,
                 driver: driver.clone(),
@@ -739,6 +743,16 @@ impl<T: DeviceBacking> NvmeDriver<T> {
     pub fn update_servicing_flags(&mut self, nvme_keepalive: bool) {
         self.nvme_keepalive = nvme_keepalive;
     }
+
+    /// Get the controller instance ID, if set.
+    pub fn controller_instance_id(&self) -> Option<String> {
+        self.controller_instance_id.clone()
+    }
+
+    /// Set the controller instance ID.
+    pub fn set_controller_instance_id(&mut self, controller_instance_id: String) {
+        self.controller_instance_id = Some(controller_instance_id);
+    }
 }

 async fn handle_asynchronous_events(
@@ -1051,6 +1065,9 @@ pub mod save_restore {
         /// NVMe driver worker task data.
         #[mesh(4)]
         pub worker_data: NvmeDriverWorkerSavedState,
+        /// Controller instance ID.
+        #[mesh(5)]
+        pub controller_instance_id: Option<String>,
Member:

Why would we need to save this?

Contributor Author:

If we don't have this, where does it get set back to its value when we do a restore, since we do not call GetDriver (the only other path to set it)?
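
A self-contained sketch of the round-trip the author describes (illustrative names only, not the actual driver API): on the create path the manager injects the ID through the setter, while the restore path never calls get_driver, so the value has to travel through the saved state.

```rust
// Hypothetical model of the save/restore round-trip; types and names
// are illustrative stand-ins for the real NvmeDriver and its saved state.
#[derive(Clone, Default)]
struct SavedState {
    controller_instance_id: Option<String>,
}

struct Driver {
    controller_instance_id: Option<String>,
}

impl Driver {
    fn new() -> Self {
        // Create path: the manager sets the ID afterwards via the setter.
        Self {
            controller_instance_id: None,
        }
    }

    fn set_controller_instance_id(&mut self, id: String) {
        self.controller_instance_id = Some(id);
    }

    fn save(&self) -> SavedState {
        SavedState {
            controller_instance_id: self.controller_instance_id.clone(),
        }
    }

    fn restore(saved: &SavedState) -> Self {
        // Restore path: the setter is never called here, so without this
        // field in the saved state the ID would be lost across servicing.
        Self {
            controller_instance_id: saved.controller_instance_id.clone(),
        }
    }
}

fn main() {
    let mut d = Driver::new();
    d.set_controller_instance_id("00000000-0000-0000-0000-000000000001".into());
    let restored = Driver::restore(&d.save());
    assert_eq!(
        restored.controller_instance_id.as_deref(),
        Some("00000000-0000-0000-0000-000000000001")
    );
}
```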

     }

     /// Save/restore state for NVMe driver worker task.