From 7b54128a3cbe1ee53bf7d2fa6b78cf77c373eb62 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 28 Apr 2023 09:54:56 -0400 Subject: [PATCH 01/39] [sled-agent] Make service_manager responsible for storage services too --- illumos-utils/src/zpool.rs | 1 + openapi/sled-agent.json | 109 ++++- sled-agent/src/bootstrap/hardware.rs | 5 +- sled-agent/src/http_entrypoints.rs | 1 + sled-agent/src/lib.rs | 1 + sled-agent/src/params.rs | 49 +- sled-agent/src/rack_setup/plan/service.rs | 10 +- sled-agent/src/services.rs | 291 +++++++++-- sled-agent/src/sled_agent.rs | 37 +- sled-agent/src/storage/dataset.rs | 72 +++ sled-agent/src/storage/mod.rs | 7 + sled-agent/src/storage_manager.rs | 560 ++-------------------- smf/cockroachdb/method_script.sh | 6 + 13 files changed, 534 insertions(+), 615 deletions(-) create mode 100644 sled-agent/src/storage/dataset.rs create mode 100644 sled-agent/src/storage/mod.rs diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index cd0fa847c78..dc4507f0ea8 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -249,6 +249,7 @@ impl Zpool { } #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, JsonSchema)] +#[serde(rename_all = "snake_case")] pub enum ZpoolKind { // This zpool is used for external storage (u.2) External, diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index fd93c6cc34c..b77c318eadd 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -640,13 +640,6 @@ { "type": "object", "properties": { - "all_addresses": { - "description": "The addresses of all nodes within the cluster.", - "type": "array", - "items": { - "type": "string" - } - }, "type": { "type": "string", "enum": [ @@ -655,7 +648,6 @@ } }, "required": [ - "all_addresses", "type" ] }, @@ -689,6 +681,21 @@ } ] }, + "DatasetName": { + "type": "object", + "properties": { + "kind": { + "$ref": "#/components/schemas/DatasetKind" + }, + "pool_name": { + "$ref": "#/components/schemas/ZpoolName" + } + }, + "required": 
[ + "kind", + "pool_name" + ] + }, "DendriteAsic": { "type": "string", "enum": [ @@ -1738,7 +1745,7 @@ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, "ServiceEnsureBody": { - "description": "Used to request that the Sled initialize certain services on initialization.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.", + "description": "Used to request that the Sled initialize certain services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.", "type": "object", "properties": { "services": { @@ -2036,6 +2043,48 @@ "mode", "type" ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "clickhouse" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "cockroach_db" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "crucible" + ] + } + }, + "required": [ + "type" + ] } ] }, @@ -2050,6 +2099,15 @@ "format": "ipv6" } }, + "dataset": { + "nullable": true, + "default": null, + "allOf": [ + { + "$ref": "#/components/schemas/DatasetName" + } + ] + }, "gz_addresses": { "default": [], "type": "array", @@ -2465,13 +2523,16 @@ "description": "The type of zone which may be requested from Sled Agent", "type": "string", "enum": [ + "clickhouse", + "cockroach_db", + "crucible_pantry", + "crucible", "external_dns", "internal_dns", "nexus", + "ntp", "oximeter", - "switch", - "crucible_pantry", - "ntp" + "switch" ] }, "Zpool": { @@ -2489,6 +2550,30 @@ "disk_type", "id" ] + }, + "ZpoolKind": { + "type": "string", + "enum": [ + 
"external", + "internal" + ] + }, + "ZpoolName": { + "description": "A wrapper around a zpool name.\n\nThis expects that the format will be: `ox{i,p}_` - we parse the prefix when reading the structure, and validate that the UUID can be utilized.", + "type": "object", + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "kind": { + "$ref": "#/components/schemas/ZpoolKind" + } + }, + "required": [ + "id", + "kind" + ] } } } diff --git a/sled-agent/src/bootstrap/hardware.rs b/sled-agent/src/bootstrap/hardware.rs index 491d3416177..2bb068438bc 100644 --- a/sled-agent/src/bootstrap/hardware.rs +++ b/sled-agent/src/bootstrap/hardware.rs @@ -177,10 +177,7 @@ impl HardwareMonitor { let hardware = HardwareManager::new(log, sled_mode) .map_err(|e| Error::Hardware(e))?; - // TODO: The coupling between the storage and service manager is growing - // pretty tight; we should consider merging them together. - let storage_manager = - StorageManager::new(&log, underlay_etherstub.clone()).await; + let storage_manager = StorageManager::new(&log).await; let service_manager = ServiceManager::new( log.clone(), diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 027cf7dc88d..4a0db86ed59 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -103,6 +103,7 @@ async fn filesystem_put( let sa = rqctx.context(); let body_args = body.into_inner(); sa.filesystem_ensure( + body_args.id, body_args.zpool_id, body_args.dataset_kind, body_args.address, diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 0944aa62bba..9682fa3cc80 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -31,6 +31,7 @@ mod services; mod sled_agent; mod smf_helper; pub mod sp; +pub(crate) mod storage; mod storage_manager; mod updates; diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 104c1922ebd..3276e1d46ed 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ 
-206,13 +206,12 @@ pub struct Zpool { /// The type of a dataset, and an auxiliary information necessary /// to successfully launch a zone managing the associated data. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, +)] #[serde(tag = "type", rename_all = "snake_case")] pub enum DatasetKind { - CockroachDb { - /// The addresses of all nodes within the cluster. - all_addresses: Vec, - }, + CockroachDb, Crucible, Clickhouse, } @@ -221,9 +220,7 @@ impl From for sled_agent_client::types::DatasetKind { fn from(k: DatasetKind) -> Self { use DatasetKind::*; match k { - CockroachDb { all_addresses } => Self::CockroachDb( - all_addresses.iter().map(|a| a.to_string()).collect(), - ), + CockroachDb => Self::CockroachDb, Crucible => Self::Crucible, Clickhouse => Self::Clickhouse, } @@ -333,6 +330,9 @@ pub enum ServiceType { Maghemite { mode: String, }, + Clickhouse, + CockroachDb, + Crucible, } impl std::fmt::Display for ServiceType { @@ -350,6 +350,9 @@ impl std::fmt::Display for ServiceType { ServiceType::BoundaryNtp { .. } | ServiceType::InternalNtp { .. } => write!(f, "ntp"), ServiceType::Maghemite { .. 
} => write!(f, "mg-ddm"), + ServiceType::Clickhouse => write!(f, "clickhouse"), + ServiceType::CockroachDb => write!(f, "cockroachdb"), + ServiceType::Crucible => write!(f, "crucible"), } } } @@ -427,6 +430,9 @@ impl From for sled_agent_client::types::ServiceType { AutoSt::InternalNtp { ntp_servers, dns_servers, domain } } St::Maghemite { mode } => AutoSt::Maghemite { mode }, + St::Clickhouse => AutoSt::Clickhouse, + St::CockroachDb => AutoSt::CockroachDb, + St::Crucible => AutoSt::Crucible, } } } @@ -437,25 +443,31 @@ impl From for sled_agent_client::types::ServiceType { )] #[serde(rename_all = "snake_case")] pub enum ZoneType { + Clickhouse, + CockroachDb, + CruciblePantry, + Crucible, ExternalDns, InternalDns, Nexus, + Ntp, Oximeter, Switch, - CruciblePantry, - Ntp, } impl From for sled_agent_client::types::ZoneType { fn from(zt: ZoneType) -> Self { match zt { + ZoneType::Clickhouse => Self::Clickhouse, + ZoneType::CockroachDb => Self::CockroachDb, + ZoneType::Crucible => Self::Crucible, + ZoneType::CruciblePantry => Self::CruciblePantry, ZoneType::InternalDns => Self::InternalDns, ZoneType::ExternalDns => Self::ExternalDns, ZoneType::Nexus => Self::Nexus, + ZoneType::Ntp => Self::Ntp, ZoneType::Oximeter => Self::Oximeter, ZoneType::Switch => Self::Switch, - ZoneType::CruciblePantry => Self::CruciblePantry, - ZoneType::Ntp => Self::Ntp, } } } @@ -464,13 +476,16 @@ impl std::fmt::Display for ZoneType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use ZoneType::*; let name = match self { + Clickhouse => "clickhouse", + CockroachDb => "cockroachdb", + Crucible => "crucible", + CruciblePantry => "crucible_pantry", ExternalDns => "external_dns", InternalDns => "internal_dns", Nexus => "nexus", + Ntp => "ntp", Oximeter => "oximeter", Switch => "switch", - CruciblePantry => "crucible_pantry", - Ntp => "ntp", }; write!(f, "{name}") } @@ -487,6 +502,9 @@ pub struct ServiceZoneRequest { pub zone_type: ZoneType, // The addresses on which the service 
should listen for requests. pub addresses: Vec, + // Datasets which should be managed by this service. + #[serde(default)] + pub dataset: Option, // The addresses in the global zone which should be created, if necessary // to route to the service. // @@ -511,13 +529,14 @@ impl From for sled_agent_client::types::ServiceZoneRequest { id: s.id, zone_type: s.zone_type.into(), addresses: s.addresses, + dataset: s.dataset.map(|d| d.into()), gz_addresses: s.gz_addresses, services, } } } -/// Used to request that the Sled initialize certain services on initialization. +/// Used to request that the Sled initialize certain services. /// /// This may be used to record that certain sleds are responsible for /// launching services which may not be associated with a dataset, such diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 3f2f4af54e2..9bc73ce22b3 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -296,6 +296,7 @@ impl Plan { id, zone_type: ZoneType::ExternalDns, addresses: vec![internal_ip], + dataset: None, gz_addresses: vec![], services: vec![ServiceType::ExternalDns { http_address: SocketAddrV6::new( @@ -329,6 +330,7 @@ impl Plan { id, zone_type: ZoneType::Nexus, addresses: vec![address], + dataset: None, gz_addresses: vec![], services: vec![ServiceType::Nexus { internal_ip: address, @@ -354,6 +356,7 @@ impl Plan { id, zone_type: ZoneType::Oximeter, addresses: vec![address], + dataset: None, gz_addresses: vec![], services: vec![ServiceType::Oximeter], }) @@ -373,9 +376,7 @@ impl Plan { request.datasets.push(DatasetEnsureBody { id, zpool_id: u2_zpools[0], - dataset_kind: crate::params::DatasetKind::CockroachDb { - all_addresses: vec![address], - }, + dataset_kind: crate::params::DatasetKind::CockroachDb, address, }); } @@ -444,6 +445,7 @@ impl Plan { id, zone_type: ZoneType::InternalDns, addresses: vec![dns_addr], + dataset: None, gz_addresses: 
vec![dns_subnet.gz_address().ip()], services: vec![ServiceType::InternalDns { http_address: SocketAddrV6::new( @@ -476,6 +478,7 @@ impl Plan { id, zone_type: ZoneType::CruciblePantry, addresses: vec![address], + dataset: None, gz_addresses: vec![], services: vec![ServiceType::CruciblePantry], }) @@ -523,6 +526,7 @@ impl Plan { id, zone_type: ZoneType::Ntp, addresses: vec![address], + dataset: None, gz_addresses: vec![], services, }); diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 0b3912e3ba6..4dbfc163be2 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -20,7 +20,7 @@ //! of what other services Nexus wants to have executing on the sled. //! //! To accomplish this, the following interfaces are exposed: -//! - [ServiceManager::ensure_persistent] exposes an API to request a set of +//! - [ServiceManager::ensure_all_services] exposes an API to request a set of //! services that should persist beyond reboot. //! - [ServiceManager::activate_switch] exposes an API to specifically enable //! or disable (via [ServiceManager::deactivate_switch]) the switch zone. @@ -50,7 +50,10 @@ use itertools::Itertools; use omicron_common::address::Ipv6Subnet; use omicron_common::address::AZ_PREFIX; use omicron_common::address::BOOTSTRAP_ARTIFACT_PORT; +use omicron_common::address::CLICKHOUSE_PORT; +use omicron_common::address::COCKROACH_PORT; use omicron_common::address::CRUCIBLE_PANTRY_PORT; +use omicron_common::address::CRUCIBLE_PORT; use omicron_common::address::DENDRITE_PORT; use omicron_common::address::MGS_PORT; use omicron_common::address::NEXUS_INTERNAL_PORT; @@ -86,7 +89,9 @@ use tokio::task::JoinHandle; use uuid::Uuid; // The filename of ServiceManager's internal storage. 
-const SERVICE_CONFIG_FILENAME: &str = "service.toml"; +const SERVICES_CONFIG_FILENAME: &str = "services.toml"; +const STORAGE_SERVICES_CONFIG_FILENAME: &str = "storage-services.toml"; + // The filename of a half-completed config, in need of parameters supplied at // runtime. const PARTIAL_CONFIG_FILENAME: &str = "config-partial.toml"; @@ -194,10 +199,16 @@ impl From for omicron_common::api::external::Error { } } -/// The default path to service configuration, if one is not -/// explicitly provided. -pub fn default_services_config_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH).join(SERVICE_CONFIG_FILENAME) +// The default path to service configuration +fn default_services_config_path() -> PathBuf { + Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join(SERVICES_CONFIG_FILENAME) +} + +// The default path to storage service configuration +fn default_storage_services_config_path() -> PathBuf { + Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join(STORAGE_SERVICES_CONFIG_FILENAME) } /// Configuration parameters which modify the [`ServiceManager`]'s behavior. @@ -211,9 +222,10 @@ pub struct Config { /// An optional internet gateway address for external services. pub gateway_address: Option, - /// The path for the ServiceManager to store information about - /// all running services. - pub all_svcs_config_path: PathBuf, + // The path for the ServiceManager to store information about + // all running services. 
+ all_svcs_config_path: PathBuf, + storage_svcs_config_path: PathBuf, } impl Config { @@ -227,6 +239,7 @@ impl Config { sidecar_revision, gateway_address, all_svcs_config_path: default_services_config_path(), + storage_svcs_config_path: default_storage_services_config_path(), } } } @@ -249,6 +262,7 @@ impl AllZoneRequests { #[derive(Clone, serde::Serialize, serde::Deserialize)] struct ZoneRequest { zone: ServiceZoneRequest, + // TODO: Consider collapsing "root" into ServiceZoneRequest root: PathBuf, } @@ -303,7 +317,10 @@ pub struct ServiceManagerInner { time_synced: AtomicBool, sidecar_revision: String, switch_zone_maghemite_links: Vec, + // Zones representing running services zones: Mutex>, + // Zones representing services which own datasets + dataset_zones: Mutex>, underlay_vnic_allocator: VnicAllocator, underlay_vnic: EtherstubVnic, bootstrap_vnic_allocator: VnicAllocator, @@ -373,6 +390,7 @@ impl ServiceManager { sidecar_revision, switch_zone_maghemite_links, zones: Mutex::new(vec![]), + dataset_zones: Mutex::new(vec![]), underlay_vnic_allocator: VnicAllocator::new( "Service", underlay_etherstub, @@ -521,6 +539,16 @@ impl ServiceManager { } } + // Returns either the path to the explicitly provided config path, or + // chooses the default one. + fn storage_services_config_path(&self) -> Result { + if let Some(info) = self.inner.sled_info.get() { + Ok(info.config.storage_svcs_config_path.clone()) + } else { + Err(Error::SledAgentNotReady) + } + } + // Advertise the /64 prefix of `address`, unless we already have. // // This method only blocks long enough to check our HashSet of @@ -864,6 +892,18 @@ impl ServiceManager { let opte_ports = self.opte_ports_needed(&request.zone).await?; let limit_priv = Self::privs_needed(&request.zone); + // If the zone is managing a particular dataset, plumb that + // dataset into the zone. Additionally, construct a "unique enough" name + // so we can create multiple zones of this type without collision. 
+ let (unique_name, datasets) = + if let Some(dataset) = &request.zone.dataset { + ( + Some(dataset.pool().to_string()), + vec![zone::Dataset { name: dataset.full() }], + ) + } else { + (None, vec![]) + }; let devices: Vec = device_names .iter() .map(|d| zone::Device { name: d.to_string() }) @@ -874,11 +914,8 @@ impl ServiceManager { &self.inner.underlay_vnic_allocator, &request.root, &request.zone.zone_type.to_string(), - // unique_name= - None, - // dataset= - &[], - // filesystems= + unique_name.as_deref(), + &datasets, &filesystems, &devices, opte_ports, @@ -893,6 +930,107 @@ impl ServiceManager { // These zones are self-assembling -- after they boot, there should // be no "zlogin" necessary to initialize. match request.zone.zone_type { + ZoneType::Clickhouse => { + let Some(info) = self.inner.sled_info.get() else { + return Err(Error::SledAgentNotReady); + }; + let datalink = installed_zone.get_control_vnic_name(); + let gateway = &info.underlay_address.to_string(); + assert_eq!(request.zone.addresses.len(), 1); + let listen_addr = &request.zone.addresses[0].to_string(); + let listen_port = &CLICKHOUSE_PORT.to_string(); + + let config = PropertyGroupBuilder::new("config") + .add_property("datalink", "astring", datalink) + .add_property("gateway", "astring", gateway) + .add_property("listen_addr", "astring", listen_addr) + .add_property("listen_port", "astring", listen_port) + .add_property("store", "astring", "/data"); + + let profile = ProfileBuilder::new("omicron").add_service( + ServiceBuilder::new("oxide/clickhouse").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(config), + ), + ); + profile + .add_to_zone(&self.inner.log, &installed_zone) + .await + .map_err(|err| { + Error::io("Failed to setup clickhouse profile", err) + })?; + return Ok(RunningZone::boot(installed_zone).await?); + } + ZoneType::CockroachDb => { + let Some(info) = self.inner.sled_info.get() else { + return Err(Error::SledAgentNotReady); + }; + let datalink = 
installed_zone.get_control_vnic_name(); + let gateway = &info.underlay_address.to_string(); + assert_eq!(request.zone.addresses.len(), 1); + let listen_addr = &request.zone.addresses[0].to_string(); + let listen_port = &COCKROACH_PORT.to_string(); + + let config = PropertyGroupBuilder::new("config") + .add_property("datalink", "astring", datalink) + .add_property("gateway", "astring", gateway) + .add_property("listen_addr", "astring", listen_addr) + .add_property("listen_port", "astring", listen_port) + .add_property("store", "astring", "/data"); + + let profile = ProfileBuilder::new("omicron").add_service( + ServiceBuilder::new("oxide/cockroachdb").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(config), + ), + ); + profile + .add_to_zone(&self.inner.log, &installed_zone) + .await + .map_err(|err| { + Error::io("Failed to setup CRDB profile", err) + })?; + return Ok(RunningZone::boot(installed_zone).await?); + } + ZoneType::Crucible => { + let Some(info) = self.inner.sled_info.get() else { + return Err(Error::SledAgentNotReady); + }; + let datalink = installed_zone.get_control_vnic_name(); + let gateway = &info.underlay_address.to_string(); + assert_eq!(request.zone.addresses.len(), 1); + let listen_addr = &request.zone.addresses[0].to_string(); + let listen_port = &CRUCIBLE_PORT.to_string(); + + let dataset = request + .zone + .dataset + .as_ref() + .expect("Crucible requires dataset"); + let uuid = &Uuid::new_v4().to_string(); + let config = PropertyGroupBuilder::new("config") + .add_property("datalink", "astring", datalink) + .add_property("gateway", "astring", gateway) + .add_property("dataset", "astring", &dataset.full()) + .add_property("listen_addr", "astring", listen_addr) + .add_property("listen_port", "astring", listen_port) + .add_property("uuid", "astring", uuid) + .add_property("store", "astring", "/data"); + + let profile = ProfileBuilder::new("omicron").add_service( + 
ServiceBuilder::new("oxide/crucible/agent").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(config), + ), + ); + profile + .add_to_zone(&self.inner.log, &installed_zone) + .await + .map_err(|err| { + Error::io("Failed to setup crucible profile", err) + })?; + return Ok(RunningZone::boot(installed_zone).await?); + } ZoneType::CruciblePantry => { let Some(info) = self.inner.sled_info.get() else { return Err(Error::SledAgentNotReady); @@ -1324,9 +1462,6 @@ impl ServiceManager { smfh.setprop("config/port", &format!("{}", DENDRITE_PORT))?; smfh.refresh()?; } - ServiceType::CruciblePantry => { - panic!("CruciblePantry is self-assembling now") - } ServiceType::BoundaryNtp { ntp_servers, dns_servers, @@ -1450,6 +1585,12 @@ impl ServiceManager { smfh.refresh()?; } + ServiceType::Crucible + | ServiceType::CruciblePantry + | ServiceType::CockroachDb + | ServiceType::Clickhouse => { + panic!("{service} is a service which exists as part of a self-assembling zone") + } } debug!(self.inner.log, "enabling service"); @@ -1514,7 +1655,7 @@ impl ServiceManager { /// These services will be instantiated by this function, and will be /// recorded to a local file to ensure they start automatically on next /// boot. - pub async fn ensure_persistent( + pub async fn ensure_all_services( &self, request: ServiceEnsureBody, ) -> Result<(), Error> { @@ -1590,6 +1731,72 @@ impl ServiceManager { Ok(()) } + /// Ensures that a storage zone be initialized. + /// + /// These services will be instantiated by this function, and will be + /// recorded to a local file to ensure they start automatically on next + /// boot. 
+ pub async fn ensure_storage_service( + &self, + request: ServiceZoneRequest, + ) -> Result<(), Error> { + let mut existing_zones = self.inner.dataset_zones.lock().await; + let config_path = self.storage_services_config_path()?; + + let mut zone_requests: AllZoneRequests = { + if config_path.exists() { + debug!(self.inner.log, "Reading old storage service requests"); + toml::from_str( + &tokio::fs::read_to_string(&config_path) + .await + .map_err(|err| Error::io_path(&config_path, err))?, + ) + .map_err(|err| Error::TomlDeserialize { + path: config_path.clone(), + err, + })? + } else { + debug!(self.inner.log, "No old storage service requests"); + AllZoneRequests::new() + } + }; + + if !zone_requests + .requests + .iter() + .any(|zone_request| zone_request.zone.id == request.id) + { + // If this is a new request, provision a zone filesystem on the same + // disk as the dataset. + let dataset = request + .dataset + .as_ref() + .expect("Storage services should have a dataset"); + let root = dataset + .pool() + .dataset_mountpoint(sled_hardware::disk::ZONE_DATASET); + zone_requests.requests.push(ZoneRequest { zone: request, root }); + } + + self.initialize_services_locked( + &mut existing_zones, + &zone_requests.requests, + ) + .await?; + + let serialized_services = toml::Value::try_from(&zone_requests) + .expect("Cannot serialize service list"); + let services_str = + toml::to_string(&serialized_services).map_err(|err| { + Error::TomlSerialize { path: config_path.clone(), err } + })?; + tokio::fs::write(&config_path, services_str) + .await + .map_err(|err| Error::io_path(&config_path, err))?; + + Ok(()) + } + pub fn boottime_rewrite(&self, zones: &Vec) { if self .inner @@ -1745,6 +1952,7 @@ impl ServiceManager { id: Uuid::new_v4(), zone_type: ZoneType::Switch, addresses, + dataset: None, gz_addresses: vec![], services, }; @@ -2099,11 +2307,12 @@ mod test { async fn ensure_new_service(mgr: &ServiceManager, id: Uuid) { let _expectations = expect_new_service(); - 
mgr.ensure_persistent(ServiceEnsureBody { + mgr.ensure_all_services(ServiceEnsureBody { services: vec![ServiceZoneRequest { id, zone_type: ZoneType::Oximeter, addresses: vec![Ipv6Addr::LOCALHOST], + dataset: None, gz_addresses: vec![], services: vec![ServiceType::Oximeter], }], @@ -2115,11 +2324,12 @@ mod test { // Prepare to call "ensure" for a service which already exists. We should // return the service without actually installing a new zone. async fn ensure_existing_service(mgr: &ServiceManager, id: Uuid) { - mgr.ensure_persistent(ServiceEnsureBody { + mgr.ensure_all_services(ServiceEnsureBody { services: vec![ServiceZoneRequest { id, zone_type: ZoneType::Oximeter, addresses: vec![Ipv6Addr::LOCALHOST], + dataset: None, gz_addresses: vec![], services: vec![ServiceType::Oximeter], }], @@ -2162,12 +2372,15 @@ mod test { fn make_config(&self) -> Config { let all_svcs_config_path = - self.config_dir.path().join(SERVICE_CONFIG_FILENAME); + self.config_dir.path().join(SERVICES_CONFIG_FILENAME); + let storage_svcs_config_path = + self.config_dir.path().join(STORAGE_SERVICES_CONFIG_FILENAME); Config { sled_id: Uuid::new_v4(), sidecar_revision: "rev_whatever_its_a_test".to_string(), gateway_address: None, all_svcs_config_path, + storage_svcs_config_path, } } } @@ -2190,11 +2403,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2239,11 +2448,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2291,11 +2496,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await 
.unwrap(); @@ -2331,11 +2532,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2380,11 +2577,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); @@ -2422,11 +2615,7 @@ mod test { "rev-test".to_string(), SWITCH_ZONE_BOOTSTRAP_IP, vec![], - StorageManager::new( - &log, - Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - ) - .await, + StorageManager::new(&log).await, ) .await .unwrap(); diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 1831ac25941..1f731622180 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -12,7 +12,7 @@ use crate::params::{ DatasetKind, DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, ServiceEnsureBody, - SledRole, TimeSync, VpcFirewallRule, Zpool, + ServiceType, SledRole, TimeSync, VpcFirewallRule, ZoneType, Zpool, }; use crate::services::{self, ServiceManager}; use crate::storage_manager::{self, StorageManager}; @@ -254,7 +254,6 @@ impl SledAgent { storage .setup_underlay_access(storage_manager::UnderlayAccess { lazy_nexus_client: lazy_nexus_client.clone(), - underlay_address: *sled_address.ip(), sled_id: request.id, }) .await?; @@ -518,7 +517,7 @@ impl SledAgent { &self, requested_services: ServiceEnsureBody, ) -> Result<(), Error> { - self.inner.services.ensure_persistent(requested_services).await?; + self.inner.services.ensure_all_services(requested_services).await?; Ok(()) } @@ -540,14 +539,40 @@ impl SledAgent { /// Ensures that a filesystem type exists within the zpool. 
pub async fn filesystem_ensure( &self, - zpool_uuid: Uuid, + dataset_id: Uuid, + zpool_id: Uuid, dataset_kind: DatasetKind, address: SocketAddrV6, ) -> Result<(), Error> { - self.inner + // First, ensure the dataset exists + let dataset = self + .inner .storage - .upsert_filesystem(zpool_uuid, dataset_kind, address) + .upsert_filesystem(dataset_id, zpool_id, dataset_kind.clone()) .await?; + let (zone_type, services) = match dataset_kind { + DatasetKind::Clickhouse => { + (ZoneType::Clickhouse, vec![ServiceType::Clickhouse]) + } + DatasetKind::CockroachDb => { + (ZoneType::CockroachDb, vec![ServiceType::CockroachDb]) + } + DatasetKind::Crucible => { + (ZoneType::Crucible, vec![ServiceType::Crucible]) + } + }; + + // Next, ensure a zone exists to manage storage for that dataset + let request = crate::params::ServiceZoneRequest { + id: dataset_id, + zone_type, + addresses: vec![*address.ip()], + dataset: Some(dataset), + gz_addresses: vec![], + services, + }; + self.inner.services.ensure_storage_service(request).await?; + Ok(()) } diff --git a/sled-agent/src/storage/dataset.rs b/sled-agent/src/storage/dataset.rs new file mode 100644 index 00000000000..e36bc89e4e5 --- /dev/null +++ b/sled-agent/src/storage/dataset.rs @@ -0,0 +1,72 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::params::DatasetKind; +use illumos_utils::zpool::ZpoolKind; +use illumos_utils::zpool::ZpoolName; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive( + Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Clone, JsonSchema, +)] +pub struct DatasetName { + // A unique identifier for the Zpool on which the dataset is stored. + pool_name: ZpoolName, + // A name for the dataset within the Zpool. 
+ kind: DatasetKind, +} + +impl DatasetName { + pub fn new(pool_name: ZpoolName, kind: DatasetKind) -> Self { + Self { pool_name, kind } + } + + pub fn pool(&self) -> &ZpoolName { + &self.pool_name + } + + pub fn dataset(&self) -> &DatasetKind { + &self.kind + } + + pub fn full(&self) -> String { + format!("{}/{}", self.pool_name, self.kind.to_string()) + } +} + +impl From for sled_agent_client::types::DatasetName { + fn from(n: DatasetName) -> Self { + let id = n.pool().id(); + + // NOTE: Ideally, this translation would live alongside the definitions + // of ZpoolKind and ZpoolName, but they're currently in illumos-utils, + // which has no dependency on sled_agent_client. + let kind = match n.pool().kind() { + ZpoolKind::External => { + sled_agent_client::types::ZpoolKind::External + } + ZpoolKind::Internal => { + sled_agent_client::types::ZpoolKind::Internal + } + }; + let pool_name = sled_agent_client::types::ZpoolName { id, kind }; + + Self { pool_name, kind: n.dataset().clone().into() } + } +} + +#[cfg(test)] +mod test { + use super::*; + use uuid::Uuid; + + #[test] + fn serialize_dataset_name() { + let pool = ZpoolName::new_internal(Uuid::new_v4()); + let kind = DatasetKind::Crucible; + let name = DatasetName::new(pool, kind); + toml::to_string(&name).unwrap(); + } +} diff --git a/sled-agent/src/storage/mod.rs b/sled-agent/src/storage/mod.rs new file mode 100644 index 00000000000..8444ecace40 --- /dev/null +++ b/sled-agent/src/storage/mod.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Management of local storage + +pub(crate) mod dataset; diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 142fa9f5685..8e23fc89297 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -6,35 +6,26 @@ use crate::nexus::LazyNexusClient; use crate::params::DatasetKind; -use crate::profile::*; +use crate::storage::dataset::DatasetName; use futures::stream::FuturesOrdered; use futures::FutureExt; use futures::StreamExt; -use illumos_utils::dladm::Etherstub; -use illumos_utils::link::VnicAllocator; -use illumos_utils::running_zone::{InstalledZone, RunningZone}; -use illumos_utils::zone::AddressRequest; use illumos_utils::zpool::{ZpoolKind, ZpoolName}; -use illumos_utils::{zfs::Mountpoint, zone::ZONE_PREFIX, zpool::ZpoolInfo}; +use illumos_utils::{zfs::Mountpoint, zpool::ZpoolInfo}; use nexus_client::types::PhysicalDiskDeleteRequest; use nexus_client::types::PhysicalDiskKind; use nexus_client::types::PhysicalDiskPutRequest; use nexus_client::types::ZpoolPutRequest; use omicron_common::api::external::{ByteCount, ByteCountRangeError}; use omicron_common::backoff; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; use sled_hardware::{Disk, DiskIdentity, DiskVariant, UnparsedDisk}; use slog::Logger; use std::collections::hash_map; use std::collections::HashMap; use std::convert::TryFrom; -use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; use std::path::PathBuf; use std::pin::Pin; use std::sync::Arc; -use tokio::fs::{create_dir_all, File}; -use tokio::io::AsyncWriteExt; use tokio::sync::{mpsc, oneshot, Mutex}; use tokio::task::JoinHandle; use uuid::Uuid; @@ -90,6 +81,9 @@ pub enum Error { err: uuid::Error, }, + #[error("Dataset {name:?} exists with a different uuid (has {old}, requested {new})")] + UuidMismatch { name: DatasetName, old: Uuid, new: Uuid }, + #[error("Error parsing pool {name}'s size: {err}")] BadPoolSize { name: String, @@ -132,18 +126,10 @@ pub enum Error { 
UnderlayNotInitialized, } -impl Error { - fn io(message: &str, err: std::io::Error) -> Self { - Self::Io { message: message.to_string(), err } - } -} - /// A ZFS storage pool. struct Pool { name: ZpoolName, info: ZpoolInfo, - // ZFS filesytem UUID -> Zone. - zones: HashMap, parent: DiskIdentity, } @@ -153,312 +139,12 @@ impl Pool { /// Returns Ok if the pool exists. fn new(name: ZpoolName, parent: DiskIdentity) -> Result { let info = Zpool::get_info(&name.to_string())?; - Ok(Pool { name, info, zones: HashMap::new(), parent }) - } - - /// Associate an already running zone with this pool object. - /// - /// Typically this is used when a dataset within the zone (identified - /// by ID) has a running zone (e.g. Crucible, Cockroach) operating on - /// behalf of that data. - fn add_zone(&mut self, id: Uuid, zone: RunningZone) { - self.zones.insert(id, zone); - } - - /// Access a zone managing data within this pool. - fn get_zone(&self, id: Uuid) -> Option<&RunningZone> { - self.zones.get(&id) - } - - /// Returns the ID of the pool itself. - fn id(&self) -> Uuid { - self.name.id() + Ok(Pool { name, info, parent }) } fn parent(&self) -> &DiskIdentity { &self.parent } - - /// Returns the path for the configuration of a particular - /// dataset within the pool. This configuration file provides - /// the necessary information for zones to "launch themselves" - /// after a reboot. - async fn dataset_config_path( - &self, - dataset_id: Uuid, - ) -> Result { - let path = std::path::Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join(self.id().to_string()); - create_dir_all(&path).await.map_err(|err| Error::Io { - message: format!("creating config dir {path:?}, which would contain config for {dataset_id}"), - err, - })?; - let mut path = path.join(dataset_id.to_string()); - path.set_extension("toml"); - Ok(path) - } -} - -#[derive(Debug, Serialize, Deserialize, JsonSchema, Clone)] -struct DatasetName { - // A unique identifier for the Zpool on which the dataset is stored. 
- pool_name: ZpoolName, - // A name for the dataset within the Zpool. - dataset_name: String, -} - -impl DatasetName { - fn new(pool_name: ZpoolName, dataset_name: &str) -> Self { - Self { pool_name, dataset_name: dataset_name.to_string() } - } - - fn full(&self) -> String { - format!("{}/{}", self.pool_name, self.dataset_name) - } -} - -// Description of a dataset within a ZFS pool, which should be created -// by the Sled Agent. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -struct DatasetInfo { - address: SocketAddrV6, - kind: DatasetKind, - name: DatasetName, -} - -impl DatasetInfo { - fn new( - pool: ZpoolName, - kind: DatasetKind, - address: SocketAddrV6, - ) -> DatasetInfo { - match kind { - DatasetKind::CockroachDb { .. } => DatasetInfo { - name: DatasetName::new(pool, "cockroachdb"), - address, - kind, - }, - DatasetKind::Clickhouse { .. } => DatasetInfo { - name: DatasetName::new(pool, "clickhouse"), - address, - kind, - }, - DatasetKind::Crucible { .. } => DatasetInfo { - name: DatasetName::new(pool, "crucible"), - address, - kind, - }, - } - } - - fn zone_prefix(&self) -> String { - format!("{}{}_", ZONE_PREFIX, self.name.full()) - } -} - -// Ensures that a zone backing a particular dataset is running. -async fn ensure_running_zone( - log: &Logger, - vnic_allocator: &VnicAllocator, - dataset_info: &DatasetInfo, - dataset_name: &DatasetName, - do_format: bool, - underlay_address: Ipv6Addr, -) -> Result { - let address_request = AddressRequest::new_static( - IpAddr::V6(*dataset_info.address.ip()), - None, - ); - - let err = RunningZone::get( - log, - &vnic_allocator, - &dataset_info.zone_prefix(), - address_request, - ) - .await; - match err { - Ok(zone) => { - info!(log, "Zone for {} is already running", dataset_name.full()); - return Ok(zone); - } - Err(illumos_utils::running_zone::GetZoneError::NotFound { .. 
}) => { - info!(log, "Zone for {} was not found", dataset_name.full()); - - let zone_root_path = dataset_name - .pool_name - .dataset_mountpoint(sled_hardware::disk::ZONE_DATASET); - info!( - log, - "Installing zone {} to {}", - dataset_name.full(), - zone_root_path.display() - ); - let installed_zone = InstalledZone::install( - log, - vnic_allocator, - &zone_root_path, - &dataset_info.name.dataset_name, - Some(&dataset_name.pool_name.to_string()), - &[zone::Dataset { name: dataset_name.full() }], - &[], - &[], - vec![], - None, - vec![], - vec![], - ) - .await?; - - let datalink = installed_zone.get_control_vnic_name(); - let gateway = &underlay_address.to_string(); - let listen_addr = &dataset_info.address.ip().to_string(); - let listen_port = &dataset_info.address.port().to_string(); - - let zone = match dataset_info.kind { - DatasetKind::CockroachDb { .. } => { - let config = PropertyGroupBuilder::new("config") - .add_property("datalink", "astring", datalink) - .add_property("gateway", "astring", gateway) - .add_property("listen_addr", "astring", listen_addr) - .add_property("listen_port", "astring", listen_port) - .add_property("store", "astring", "/data"); - - let profile = ProfileBuilder::new("omicron").add_service( - ServiceBuilder::new("oxide/cockroachdb").add_instance( - ServiceInstanceBuilder::new("default") - .add_property_group(config), - ), - ); - profile.add_to_zone(log, &installed_zone).await.map_err( - |err| Error::io("Failed to setup CRDB profile", err), - )?; - let zone = RunningZone::boot(installed_zone).await?; - - // Await liveness of the cluster. 
- info!(log, "start_zone: awaiting liveness of CRDB"); - let check_health = || async { - let http_addr = SocketAddrV6::new( - *dataset_info.address.ip(), - 8080, - 0, - 0, - ); - reqwest::get(format!( - "http://{}/health?ready=1", - http_addr - )) - .await - .map_err(backoff::BackoffError::transient) - }; - let log_failure = |_, call_count, total_duration| { - if call_count == 0 { - info!(log, "cockroachdb not yet alive"); - } else if total_duration - > std::time::Duration::from_secs(5) - { - warn!(log, "cockroachdb not yet alive"; "total duration" => ?total_duration); - } - }; - backoff::retry_notify_ext( - backoff::retry_policy_internal_service(), - check_health, - log_failure, - ) - .await - .expect("expected an infinite retry loop waiting for crdb"); - - info!(log, "CRDB is online"); - // If requested, format the cluster with the initial tables. - if do_format { - info!(log, "Formatting CRDB"); - zone.run_cmd(&[ - "/opt/oxide/cockroachdb/bin/cockroach", - "sql", - "--insecure", - "--host", - &dataset_info.address.to_string(), - "--file", - "/opt/oxide/cockroachdb/sql/dbwipe.sql", - ])?; - zone.run_cmd(&[ - "/opt/oxide/cockroachdb/bin/cockroach", - "sql", - "--insecure", - "--host", - &dataset_info.address.to_string(), - "--file", - "/opt/oxide/cockroachdb/sql/dbinit.sql", - ])?; - info!(log, "Formatting CRDB - Completed"); - } - - zone - } - DatasetKind::Clickhouse { .. 
} => { - let config = PropertyGroupBuilder::new("config") - .add_property("datalink", "astring", datalink) - .add_property("gateway", "astring", gateway) - .add_property("listen_addr", "astring", listen_addr) - .add_property("listen_port", "astring", listen_port) - .add_property("store", "astring", "/data"); - - let profile = ProfileBuilder::new("omicron").add_service( - ServiceBuilder::new("oxide/clickhouse").add_instance( - ServiceInstanceBuilder::new("default") - .add_property_group(config), - ), - ); - profile.add_to_zone(log, &installed_zone).await.map_err( - |err| { - Error::io("Failed to setup clickhouse profile", err) - }, - )?; - RunningZone::boot(installed_zone).await? - } - DatasetKind::Crucible => { - let dataset = &dataset_info.name.full(); - let uuid = &Uuid::new_v4().to_string(); - let config = PropertyGroupBuilder::new("config") - .add_property("datalink", "astring", datalink) - .add_property("gateway", "astring", gateway) - .add_property("dataset", "astring", dataset) - .add_property("listen_addr", "astring", listen_addr) - .add_property("listen_port", "astring", listen_port) - .add_property("uuid", "astring", uuid) - .add_property("store", "astring", "/data"); - - let profile = ProfileBuilder::new("omicron").add_service( - ServiceBuilder::new("oxide/crucible/agent") - .add_instance( - ServiceInstanceBuilder::new("default") - .add_property_group(config), - ), - ); - profile.add_to_zone(log, &installed_zone).await.map_err( - |err| { - Error::io("Failed to setup crucible profile", err) - }, - )?; - RunningZone::boot(installed_zone).await? 
- } - }; - Ok(zone) - } - Err(illumos_utils::running_zone::GetZoneError::NotRunning { - name, - state, - }) => { - // TODO(https://github.com/oxidecomputer/omicron/issues/725): - unimplemented!("Handle a zone which exists, but is not running: {name}, in {state:?}"); - } - Err(err) => { - // TODO(https://github.com/oxidecomputer/omicron/issues/725): - unimplemented!( - "Handle a zone which exists, has some other problem: {err}" - ); - } - } } // The type of a future which is used to send a notification to Nexus. @@ -467,10 +153,10 @@ type NotifyFut = #[derive(Debug)] struct NewFilesystemRequest { + dataset_id: Uuid, zpool_id: Uuid, dataset_kind: DatasetKind, - address: SocketAddrV6, - responder: oneshot::Sender>, + responder: oneshot::Sender>, } struct UnderlayRequest { @@ -558,7 +244,6 @@ impl StorageResources { /// Describes the access to the underlay used by the StorageManager. pub struct UnderlayAccess { pub lazy_nexus_client: LazyNexusClient, - pub underlay_address: Ipv6Addr, pub sled_id: Uuid, } @@ -567,7 +252,6 @@ struct StorageWorker { log: Logger, nexus_notifications: FuturesOrdered, rx: mpsc::Receiver, - vnic_allocator: VnicAllocator, underlay: Arc>>, } @@ -582,17 +266,16 @@ impl StorageWorker { // creating it if `do_format` is true. // // Returns the UUID attached to the ZFS filesystem. - fn ensure_dataset_with_id( + fn ensure_dataset( + &mut self, + dataset_id: Uuid, dataset_name: &DatasetName, - do_format: bool, - ) -> Result { + ) -> Result<(), Error> { let zoned = true; let fs_name = &dataset_name.full(); + let do_format = true; Zfs::ensure_filesystem( - &format!( - "{}/{}", - dataset_name.pool_name, dataset_name.dataset_name - ), + &dataset_name.full(), Mountpoint::Path(PathBuf::from("/data")), zoned, do_format, @@ -600,70 +283,18 @@ impl StorageWorker { // Ensure the dataset has a usable UUID. 
if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { if let Ok(id) = id_str.parse::() { - return Ok(id); + if id != dataset_id { + return Err(Error::UuidMismatch { + name: dataset_name.clone(), + old: id, + new: dataset_id, + }); + } + return Ok(()); } } - let id = Uuid::new_v4(); - Zfs::set_oxide_value(&fs_name, "uuid", &id.to_string())?; - Ok(id) - } - - // Starts the zone for a dataset within a particular zpool. - // - // If requested via the `do_format` parameter, may also initialize - // these resources. - // - // Returns the UUID attached to the underlying ZFS dataset. - // Returns (was_inserted, Uuid). - async fn initialize_dataset_and_zone( - &mut self, - pool: &mut Pool, - dataset_info: &DatasetInfo, - do_format: bool, - ) -> Result<(bool, Uuid), Error> { - // Ensure the underlying dataset exists before trying to poke at zones. - let dataset_name = &dataset_info.name; - info!(&self.log, "Ensuring dataset {} exists", dataset_name.full()); - let id = - StorageWorker::ensure_dataset_with_id(&dataset_name, do_format)?; - - // If this zone has already been processed by us, return immediately. - if let Some(_) = pool.get_zone(id) { - return Ok((false, id)); - } - // Otherwise, the zone may or may not exist. - // We need to either look up or create the zone. 
- info!( - &self.log, - "Ensuring zone for {} is running", - dataset_name.full() - ); - - let underlay_guard = self.underlay.lock().await; - let Some(underlay) = underlay_guard.as_ref() else { - return Err(Error::UnderlayNotInitialized); - }; - let underlay_address = underlay.underlay_address; - drop(underlay_guard); - - let zone = ensure_running_zone( - &self.log, - &self.vnic_allocator, - dataset_info, - &dataset_name, - do_format, - underlay_address, - ) - .await?; - - info!( - &self.log, - "Zone {} with address {} is running", - zone.name(), - dataset_info.address, - ); - pool.add_zone(id, zone); - Ok((true, id)) + Zfs::set_oxide_value(&fs_name, "uuid", &dataset_id.to_string())?; + Ok(()) } // Adds a "notification to nexus" to `nexus_notifications`, @@ -1012,34 +643,6 @@ impl StorageWorker { })?; // Notify Nexus of the zpool. self.add_zpool_notify(&pool, size); - - // If we find filesystems within our datasets, ensure their - // zones are up-and-running. - let mut datasets = vec![]; - let existing_filesystems = Zfs::list_datasets(&pool_name.to_string())?; - for fs_name in existing_filesystems { - info!( - &self.log, - "StorageWorker loading fs {} on zpool {}", - fs_name, - pool_name.to_string() - ); - // We intentionally do not exit on error here - - // otherwise, the failure of a single dataset would - // stop the storage manager from processing all storage. - // - // Instead, we opt to log the failure. 
- let dataset_name = DatasetName::new(pool_name.clone(), &fs_name); - let result = self.load_dataset(pool, &dataset_name).await; - match result { - Ok(dataset) => datasets.push(dataset), - Err(e) => warn!( - &self.log, - "StorageWorker Failed to load dataset: {}", e - ), - } - } - Ok(()) } @@ -1048,7 +651,7 @@ impl StorageWorker { &mut self, resources: &StorageResources, request: &NewFilesystemRequest, - ) -> Result<(), Error> { + ) -> Result { info!(self.log, "add_dataset: {:?}", request); let mut pools = resources.pools.lock().await; let pool = pools.get_mut(&request.zpool_id).ok_or_else(|| { @@ -1057,80 +660,10 @@ impl StorageWorker { request.zpool_id )) })?; - - let dataset_info = DatasetInfo::new( - pool.name.clone(), - request.dataset_kind.clone(), - request.address, - ); - let (is_new_dataset, id) = self - .initialize_dataset_and_zone( - pool, - &dataset_info, - // do_format= - true, - ) - .await?; - - if !is_new_dataset { - return Ok(()); - } - - // Now that the dataset has been initialized, record the configuration - // so it can re-initialize itself after a reboot. - let path = pool.dataset_config_path(id).await?; - let info_str = toml::to_string(&dataset_info) - .map_err(|err| Error::Serialize { path: path.clone(), err })?; - let pool_name = &pool.name; - let mut file = File::create(&path).await.map_err(|err| Error::Io { - message: format!("Failed creating config file at {path:?} for pool {pool_name}, dataset: {id}"), - err, - })?; - file.write_all(info_str.as_bytes()).await.map_err(|err| Error::Io { - message: format!("Failed writing config to {path:?} for pool {pool_name}, dataset: {id}"), - err, - })?; - - Ok(()) - } - - async fn load_dataset( - &mut self, - pool: &mut Pool, - dataset_name: &DatasetName, - ) -> Result<(Uuid, SocketAddrV6, DatasetKind), Error> { - let name = dataset_name.full(); - let id = Zfs::get_oxide_value(&name, "uuid")? 
- .parse::() - .map_err(|err| Error::ParseDatasetUuid { name, err })?; - let config_path = pool.dataset_config_path(id).await?; - info!( - self.log, - "Loading Dataset from {}", - config_path.to_string_lossy() - ); - let pool_name = pool.info.name(); - let dataset_info: DatasetInfo = - toml::from_str( - &tokio::fs::read_to_string(&config_path).await.map_err(|err| Error::Io { - message: format!("read config for pool {pool_name}, dataset {dataset_name:?} from {config_path:?}"), - err, - })? - ).map_err(|err| { - Error::Deserialize { - path: config_path, - err, - } - })?; - self.initialize_dataset_and_zone( - pool, - &dataset_info, - // do_format= - false, - ) - .await?; - - Ok((id, dataset_info.address, dataset_info.kind)) + let dataset_name = + DatasetName::new(pool.name.clone(), request.dataset_kind.clone()); + self.ensure_dataset(request.dataset_id, &dataset_name)?; + Ok(dataset_name) } // Small wrapper around `Self::do_work_internal` that ensures we always @@ -1221,14 +754,13 @@ pub struct StorageManager { impl StorageManager { /// Creates a new [`StorageManager`] which should manage local storage. 
- pub async fn new(log: &Logger, etherstub: Etherstub) -> Self { + pub async fn new(log: &Logger) -> Self { let log = log.new(o!("component" => "StorageManager")); let resources = StorageResources { disks: Arc::new(Mutex::new(HashMap::new())), pools: Arc::new(Mutex::new(HashMap::new())), }; let (tx, rx) = mpsc::channel(30); - let vnic_allocator = VnicAllocator::new("Storage", etherstub); StorageManager { inner: Arc::new(StorageManagerInner { @@ -1240,7 +772,6 @@ impl StorageManager { log, nexus_notifications: FuturesOrdered::new(), rx, - vnic_allocator, underlay: Arc::new(Mutex::new(None)), }; @@ -1351,15 +882,15 @@ impl StorageManager { pub async fn upsert_filesystem( &self, + dataset_id: Uuid, zpool_id: Uuid, dataset_kind: DatasetKind, - address: SocketAddrV6, - ) -> Result<(), Error> { + ) -> Result { let (tx, rx) = oneshot::channel(); let request = NewFilesystemRequest { + dataset_id, zpool_id, dataset_kind, - address, responder: tx, }; @@ -1367,13 +898,13 @@ impl StorageManager { .tx .send(StorageWorkerRequest::NewFilesystem(request)) .await - .map_err(|_| ()) + .map_err(|e| e.to_string()) .expect("Storage worker bug (not alive)"); - rx.await.expect( + let dataset_name = rx.await.expect( "Storage worker bug (dropped responder without responding)", )?; - Ok(()) + Ok(dataset_name) } pub fn resources(&self) -> &StorageResources { @@ -1392,22 +923,3 @@ impl Drop for StorageManagerInner { self.task.abort(); } } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn serialize_dataset_info() { - let dataset_info = DatasetInfo { - address: "[::1]:8080".parse().unwrap(), - kind: DatasetKind::Crucible, - name: DatasetName::new( - ZpoolName::new_internal(Uuid::new_v4()), - "dataset", - ), - }; - - toml::to_string(&dataset_info).unwrap(); - } -} diff --git a/smf/cockroachdb/method_script.sh b/smf/cockroachdb/method_script.sh index fb9bacf854f..de017428294 100755 --- a/smf/cockroachdb/method_script.sh +++ b/smf/cockroachdb/method_script.sh @@ -28,3 +28,9 @@ args=( ) 
exec /opt/oxide/cockroachdb/bin/cockroach start-single-node "${args[@]}" & + +# TODO: Do this from RSS; I don't think we can make this call locally +# since there may be multiple instances running... + +# /opt/oxide/cockroachdb/bin/cockroach sql --insecure --host "$FULL_ADDRESS" --file /opt/oxide/cockroachdb/sql/dbwipe.sql +# /opt/oxide/cockroachdb/bin/cockroach sql --insecure --host "$FULL_ADDRESS" --file /opt/oxide/cockroachdb/sql/dbinit.sql From 0b4b04037634004d251193802719d65ae290a24b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 28 Apr 2023 12:19:16 -0400 Subject: [PATCH 02/39] CRDB auto-format on boot --- sled-agent/src/services.rs | 67 +++++++++++++++++++++++++++++-- sled-agent/src/storage/dataset.rs | 2 +- smf/cockroachdb/method_script.sh | 6 --- 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 4dbfc163be2..f9c2251e551 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -968,8 +968,12 @@ impl ServiceManager { let datalink = installed_zone.get_control_vnic_name(); let gateway = &info.underlay_address.to_string(); assert_eq!(request.zone.addresses.len(), 1); - let listen_addr = &request.zone.addresses[0].to_string(); - let listen_port = &COCKROACH_PORT.to_string(); + let address = SocketAddr::new( + IpAddr::V6(request.zone.addresses[0]), + COCKROACH_PORT, + ); + let listen_addr = &address.ip().to_string(); + let listen_port = &address.port().to_string(); let config = PropertyGroupBuilder::new("config") .add_property("datalink", "astring", datalink) @@ -990,7 +994,64 @@ impl ServiceManager { .map_err(|err| { Error::io("Failed to setup CRDB profile", err) })?; - return Ok(RunningZone::boot(installed_zone).await?); + let running_zone = RunningZone::boot(installed_zone).await?; + + // TODO: The following lines are necessary to initialize CRDB + // in a single-node environment. They're bad! They're wrong! 
+ // We definitely shouldn't be wiping the database every time + // we want to boot this zone. + // + // But they're also necessary to prevent the build from + // regressing. + // + // NOTE: In the (very short-term) future, this will be + // replaced by the following: + // 1. CRDB will simply "start", rather than "start-single-node". + // 2. The Sled Agent will expose an explicit API to "init" the + // Cockroach cluster, and populate it with the expected + // contents. + let format_crdb = || async { + info!(self.inner.log, "Formatting CRDB"); + running_zone + .run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "sql", + "--insecure", + "--host", + &address.to_string(), + "--file", + "/opt/oxide/cockroachdb/sql/dbwipe.sql", + ]) + .map_err(BackoffError::transient)?; + running_zone + .run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "sql", + "--insecure", + "--host", + &address.to_string(), + "--file", + "/opt/oxide/cockroachdb/sql/dbinit.sql", + ]) + .map_err(BackoffError::transient)?; + info!(self.inner.log, "Formatting CRDB - Completed"); + Ok::< + (), + BackoffError< + illumos_utils::running_zone::RunCommandError, + >, + >(()) + }; + let log_failure = |error, _| { + warn!( + self.inner.log, "failed to format CRDB"; + "error" => ?error, + ); + }; + retry_notify(retry_policy_local(), format_crdb, log_failure) + .await + .expect("expected an infinite retry loop waiting for crdb"); + return Ok(running_zone); } ZoneType::Crucible => { let Some(info) = self.inner.sled_info.get() else { diff --git a/sled-agent/src/storage/dataset.rs b/sled-agent/src/storage/dataset.rs index e36bc89e4e5..d4f0d63474e 100644 --- a/sled-agent/src/storage/dataset.rs +++ b/sled-agent/src/storage/dataset.rs @@ -32,7 +32,7 @@ impl DatasetName { } pub fn full(&self) -> String { - format!("{}/{}", self.pool_name, self.kind.to_string()) + format!("{}/{}", self.pool_name, self.kind) } } diff --git a/smf/cockroachdb/method_script.sh b/smf/cockroachdb/method_script.sh index 
de017428294..fb9bacf854f 100755 --- a/smf/cockroachdb/method_script.sh +++ b/smf/cockroachdb/method_script.sh @@ -28,9 +28,3 @@ args=( ) exec /opt/oxide/cockroachdb/bin/cockroach start-single-node "${args[@]}" & - -# TODO: Do this from RSS; I don't think we can make this call locally -# since there may be multiple instances running... - -# /opt/oxide/cockroachdb/bin/cockroach sql --insecure --host "$FULL_ADDRESS" --file /opt/oxide/cockroachdb/sql/dbwipe.sql -# /opt/oxide/cockroachdb/bin/cockroach sql --insecure --host "$FULL_ADDRESS" --file /opt/oxide/cockroachdb/sql/dbinit.sql From ef9517c127f23e7535a6f9194623fbcf9733dae5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Sat, 29 Apr 2023 22:28:51 -0400 Subject: [PATCH 03/39] better use of 'unique_name' (for storage zones), auto-launch storage zones, zone_name -> zone_type, config -> ledger --- illumos-utils/src/running_zone.rs | 12 +- sled-agent/src/params.rs | 15 ++ sled-agent/src/services.rs | 396 +++++++++++++++--------------- 3 files changed, 224 insertions(+), 199 deletions(-) diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 7fb8d67be3a..b2f3c5a76fd 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -574,8 +574,8 @@ impl InstalledZone { /// /// This results in a zone name which is distinct across different zpools, /// but stable and predictable across reboots. 
- pub fn get_zone_name(zone_name: &str, unique_name: Option<&str>) -> String { - let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_name); + pub fn get_zone_name(zone_type: &str, unique_name: Option<&str>) -> String { + let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_type); if let Some(suffix) = unique_name { zone_name.push_str(&format!("_{}", suffix)); } @@ -600,7 +600,7 @@ impl InstalledZone { log: &Logger, underlay_vnic_allocator: &VnicAllocator, zone_root_path: &Path, - zone_name: &str, + zone_type: &str, unique_name: Option<&str>, datasets: &[zone::Dataset], filesystems: &[zone::Fs], @@ -613,14 +613,14 @@ impl InstalledZone { let control_vnic = underlay_vnic_allocator.new_control(None).map_err(|err| { InstallZoneError::CreateVnic { - zone: zone_name.to_string(), + zone: zone_type.to_string(), err, } })?; - let full_zone_name = Self::get_zone_name(zone_name, unique_name); + let full_zone_name = Self::get_zone_name(zone_type, unique_name); let zone_image_path = - PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_name)); + PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_type)); let net_device_names: Vec = opte_ports .iter() diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 3276e1d46ed..8418cd3888c 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -518,6 +518,21 @@ pub struct ServiceZoneRequest { pub services: Vec, } +impl ServiceZoneRequest { + // The full name of the zone, if it was to be created as a zone. + pub fn zone_name(&self) -> String { + illumos_utils::running_zone::InstalledZone::get_zone_name( + &self.zone_type.to_string(), + self.zone_name_unique_identifier().as_deref(), + ) + } + + // The name of a unique identifier for the zone, if one is necessary. 
+ pub fn zone_name_unique_identifier(&self) -> Option { + self.dataset.as_ref().map(|d| d.pool().to_string()) + } +} + impl From for sled_agent_client::types::ServiceZoneRequest { fn from(s: ServiceZoneRequest) -> Self { let mut services = Vec::new(); diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index f9c2251e551..9b9171de72a 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -88,17 +88,6 @@ use tokio::sync::Mutex; use tokio::task::JoinHandle; use uuid::Uuid; -// The filename of ServiceManager's internal storage. -const SERVICES_CONFIG_FILENAME: &str = "services.toml"; -const STORAGE_SERVICES_CONFIG_FILENAME: &str = "storage-services.toml"; - -// The filename of a half-completed config, in need of parameters supplied at -// runtime. -const PARTIAL_CONFIG_FILENAME: &str = "config-partial.toml"; -// The filename of a completed config, merging the partial config with -// additional appended parameters known at runtime. -const COMPLETE_CONFIG_FILENAME: &str = "config.toml"; - #[derive(thiserror::Error, Debug)] pub enum Error { #[error("Cannot serialize TOML to file {path}: {err}")] @@ -199,18 +188,6 @@ impl From for omicron_common::api::external::Error { } } -// The default path to service configuration -fn default_services_config_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join(SERVICES_CONFIG_FILENAME) -} - -// The default path to storage service configuration -fn default_storage_services_config_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join(STORAGE_SERVICES_CONFIG_FILENAME) -} - /// Configuration parameters which modify the [`ServiceManager`]'s behavior. pub struct Config { /// Identifies the sled being configured @@ -224,8 +201,8 @@ pub struct Config { // The path for the ServiceManager to store information about // all running services. 
- all_svcs_config_path: PathBuf, - storage_svcs_config_path: PathBuf, + all_svcs_ledger_path: PathBuf, + storage_svcs_ledger_path: PathBuf, } impl Config { @@ -238,12 +215,42 @@ impl Config { sled_id, sidecar_revision, gateway_address, - all_svcs_config_path: default_services_config_path(), - storage_svcs_config_path: default_storage_services_config_path(), + all_svcs_ledger_path: default_services_ledger_path(), + storage_svcs_ledger_path: default_storage_services_ledger_path(), } } } +// The filename of ServiceManager's internal storage. +const SERVICES_CONFIG_FILENAME: &str = "services.toml"; +const STORAGE_SERVICES_CONFIG_FILENAME: &str = "storage-services.toml"; + +// The default path to service configuration +fn default_services_ledger_path() -> PathBuf { + Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join(SERVICES_CONFIG_FILENAME) +} + +// The default path to storage service configuration +fn default_storage_services_ledger_path() -> PathBuf { + Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join(STORAGE_SERVICES_CONFIG_FILENAME) +} + +// TODO(ideas): +// - "ServiceLedger" +// - Manages the serializable "AllZoneRequests" object +// - Constructor which reads from config location (kinda like +// "read_from") +// - ... Writer which *knows the type* to be serialized, so can direct it to the +// appropriate output path. +// +// - TODO: later: Can also make the path writing safer, by... +// - ... TODO: Writing to both M.2s, basically using multiple output paths +// - ... TODO: Using a temporary file and renaming it to make the update atomic +// - ... TODO: Add a .json EXPECTORATE test for the format of "AllZoneRequests" +// - we need to be careful not to break compatibility in the future. + // A wrapper around `ZoneRequest`, which allows it to be serialized // to a toml file. 
#[derive(Clone, serde::Serialize, serde::Deserialize)] @@ -255,6 +262,44 @@ impl AllZoneRequests { fn new() -> Self { Self { requests: vec![] } } + + // Reads from `path` as a toml-serialized version of `Self`. + async fn read_from(log: &Logger, path: &Path) -> Result { + if path.exists() { + debug!( + log, + "Reading old storage service requests from {}", + path.display() + ); + toml::from_str( + &tokio::fs::read_to_string(&path) + .await + .map_err(|err| Error::io_path(&path, err))?, + ) + .map_err(|err| Error::TomlDeserialize { + path: path.to_path_buf(), + err, + }) + } else { + debug!(log, "No old storage service requests"); + Ok(AllZoneRequests::new()) + } + } + + // Writes to `path` as a toml-serialized version of `Self`. + async fn write_to(&self, log: &Logger, path: &Path) -> Result<(), Error> { + debug!(log, "Writing zone request configuration to {}", path.display()); + let serialized_services = toml::Value::try_from(&self) + .expect("Cannot serialize service list"); + let services_str = + toml::to_string(&serialized_services).map_err(|err| { + Error::TomlSerialize { path: path.to_path_buf(), err } + })?; + tokio::fs::write(&path, services_str) + .await + .map_err(|err| Error::io_path(&path, err))?; + Ok(()) + } } // This struct represents the combo of "what zone did you ask for" + "where did @@ -414,6 +459,93 @@ impl ServiceManager { self.inner.switch_zone_bootstrap_address } + pub async fn load_non_storage_services(&self) -> Result<(), Error> { + let log = &self.inner.log; + let services = + AllZoneRequests::read_from(log, &self.services_ledger_path()?) + .await?; + let mut existing_zones = self.inner.zones.lock().await; + + // Initialize and DNS and NTP services first as they are required + // for time synchronization, which is a pre-requisite for the other + // services. 
+ self.initialize_services_locked( + &mut existing_zones, + &services + .requests + .clone() + .into_iter() + .filter(|svc| { + matches!( + svc.zone.zone_type, + ZoneType::InternalDns | ZoneType::Ntp + ) + }) + .collect(), + ) + .await?; + + drop(existing_zones); + + info!(&self.inner.log, "Waiting for sled time synchronization"); + + retry_notify( + retry_policy_local(), + || async { + match self.timesync_get().await { + Ok(TimeSync { sync: true, .. }) => { + info!(&self.inner.log, "Time is synchronized"); + Ok(()) + } + Ok(ts) => Err(BackoffError::transient(format!( + "No sync {:?}", + ts + ))), + Err(e) => Err(BackoffError::transient(format!( + "Error checking for time synchronization: {}", + e + ))), + } + }, + |error, delay| { + warn!( + self.inner.log, + "Time not yet synchronised (retrying in {:?})", + delay; + "error" => ?error + ); + }, + ) + .await + .expect("Expected an infinite retry loop syncing time"); + + let mut existing_zones = self.inner.zones.lock().await; + + // Initialize all remaining serivces + self.initialize_services_locked( + &mut existing_zones, + &services.requests, + ) + .await?; + Ok(()) + } + + pub async fn load_storage_services(&self) -> Result<(), Error> { + let log = &self.inner.log; + let services = AllZoneRequests::read_from( + log, + &self.storage_services_ledger_path()?, + ) + .await?; + let mut existing_zones = self.inner.dataset_zones.lock().await; + self.initialize_services_locked( + &mut existing_zones, + &services.requests, + ) + .await?; + Ok(()) + } + /// Loads services from the services manager, and returns once all requested /// services have been started. 
pub async fn sled_agent_started( @@ -438,88 +570,14 @@ impl ServiceManager { .map_err(|_| "already set".to_string()) .expect("Sled Agent should only start once"); - let config_path = self.services_config_path()?; - if config_path.exists() { - info!( - &self.inner.log, - "Sled services found at {}; loading", - config_path.to_string_lossy() - ); - let cfg: AllZoneRequests = toml::from_str( - &tokio::fs::read_to_string(&config_path) - .await - .map_err(|err| Error::io_path(&config_path, err))?, - ) - .map_err(|err| Error::TomlDeserialize { - path: config_path.clone(), - err, - })?; - let mut existing_zones = self.inner.zones.lock().await; - - // Initialize and DNS and NTP services first as they are required - // for time synchronization, which is a pre-requisite for the other - // services. - self.initialize_services_locked( - &mut existing_zones, - &cfg.requests - .clone() - .into_iter() - .filter(|svc| { - matches!( - svc.zone.zone_type, - ZoneType::InternalDns | ZoneType::Ntp - ) - }) - .collect(), - ) - .await?; - - drop(existing_zones); - - info!(&self.inner.log, "Waiting for sled time synchronization"); - - retry_notify( - retry_policy_local(), - || async { - match self.timesync_get().await { - Ok(TimeSync { sync: true, .. 
}) => { - info!(&self.inner.log, "Time is synchronized"); - Ok(()) - } - Ok(ts) => Err(BackoffError::transient(format!( - "No sync {:?}", - ts - ))), - Err(e) => Err(BackoffError::transient(format!( - "Error checking for time synchronization: {}", - e - ))), - } - }, - |error, delay| { - warn!( - self.inner.log, - "Time not yet synchronised (retrying in {:?})", - delay; - "error" => ?error - ); - }, - ) - .await - .expect("Expected an infinite retry loop syncing time"); - - let mut existing_zones = self.inner.zones.lock().await; - - // Initialize all remaining serivces - self.initialize_services_locked(&mut existing_zones, &cfg.requests) - .await?; - } else { - info!( - &self.inner.log, - "No sled services found at {}", - config_path.to_string_lossy() - ); - } + self.load_non_storage_services().await?; + // TODO: These will fail if the disks aren't attached. + // Should we have a retry loop here? Kinda like we have with the switch + // / NTP zone? + // + // NOTE: We could totally do the same thing with + // "load_non_storage_services". + self.load_storage_services().await?; Ok(()) } @@ -529,21 +587,21 @@ impl ServiceManager { self.inner.sled_mode } - // Returns either the path to the explicitly provided config path, or + // Returns either the path to the explicitly provided ledger path, or // chooses the default one. - fn services_config_path(&self) -> Result { + fn services_ledger_path(&self) -> Result { if let Some(info) = self.inner.sled_info.get() { - Ok(info.config.all_svcs_config_path.clone()) + Ok(info.config.all_svcs_ledger_path.clone()) } else { Err(Error::SledAgentNotReady) } } - // Returns either the path to the explicitly provided config path, or + // Returns either the path to the explicitly provided ledger path, or // chooses the default one. 
- fn storage_services_config_path(&self) -> Result { + fn storage_services_ledger_path(&self) -> Result { if let Some(info) = self.inner.sled_info.get() { - Ok(info.config.storage_svcs_config_path.clone()) + Ok(info.config.storage_svcs_ledger_path.clone()) } else { Err(Error::SledAgentNotReady) } @@ -895,15 +953,14 @@ impl ServiceManager { // If the zone is managing a particular dataset, plumb that // dataset into the zone. Additionally, construct a "unique enough" name // so we can create multiple zones of this type without collision. - let (unique_name, datasets) = - if let Some(dataset) = &request.zone.dataset { - ( - Some(dataset.pool().to_string()), - vec![zone::Dataset { name: dataset.full() }], - ) - } else { - (None, vec![]) - }; + let unique_name = request.zone.zone_name_unique_identifier(); + let datasets = request + .zone + .dataset + .iter() + .map(|d| zone::Dataset { name: d.full() }) + .collect::>(); + let devices: Vec = device_names .iter() .map(|d| zone::Device { name: d.to_string() }) @@ -915,7 +972,7 @@ impl ServiceManager { &request.root, &request.zone.zone_type.to_string(), unique_name.as_deref(), - &datasets, + datasets.as_slice(), &filesystems, &devices, opte_ports, @@ -1286,6 +1343,12 @@ impl ServiceManager { "{}/var/svc/manifest/site/nexus", running_zone.root() )); + // The filename of a half-completed config, in need of parameters supplied at + // runtime. + const PARTIAL_CONFIG_FILENAME: &str = "config-partial.toml"; + // The filename of a completed config, merging the partial config with + // additional appended parameters known at runtime. + const COMPLETE_CONFIG_FILENAME: &str = "config.toml"; let partial_config_path = config_dir.join(PARTIAL_CONFIG_FILENAME); let config_path = config_dir.join(COMPLETE_CONFIG_FILENAME); @@ -1682,10 +1745,7 @@ impl ServiceManager { ); // Before we bother allocating anything for this request, check if // this service has already been created. 
- let expected_zone_name = InstalledZone::get_zone_name( - &req.zone.zone_type.to_string(), - None, - ); + let expected_zone_name = req.zone.zone_name(); if existing_zones.iter().any(|z| z.name() == expected_zone_name) { info!( self.inner.log, @@ -1721,25 +1781,10 @@ impl ServiceManager { request: ServiceEnsureBody, ) -> Result<(), Error> { let mut existing_zones = self.inner.zones.lock().await; - let config_path = self.services_config_path()?; + let ledger_path = self.services_ledger_path()?; - let old_zone_requests: AllZoneRequests = { - if config_path.exists() { - debug!(self.inner.log, "Reading old service requests"); - toml::from_str( - &tokio::fs::read_to_string(&config_path) - .await - .map_err(|err| Error::io_path(&config_path, err))?, - ) - .map_err(|err| Error::TomlDeserialize { - path: config_path.clone(), - err, - })? - } else { - debug!(self.inner.log, "No old service requests"); - AllZoneRequests::new() - } - }; + let old_zone_requests = + AllZoneRequests::read_from(&self.inner.log, &ledger_path).await?; let new_zone_requests: Vec = { let known_set: HashSet<&ServiceZoneRequest> = HashSet::from_iter( @@ -1747,6 +1792,7 @@ impl ServiceManager { ); let requested_set = HashSet::from_iter(request.services.iter()); + // TODO: We probably want to handle this case. if !requested_set.is_superset(&known_set) { // The caller may only request services additively. 
// @@ -1779,15 +1825,7 @@ impl ServiceManager { .await?; zone_requests.requests.append(&mut old_zone_requests.requests.clone()); - let serialized_services = toml::Value::try_from(&zone_requests) - .expect("Cannot serialize service list"); - let services_str = - toml::to_string(&serialized_services).map_err(|err| { - Error::TomlSerialize { path: config_path.clone(), err } - })?; - tokio::fs::write(&config_path, services_str) - .await - .map_err(|err| Error::io_path(&config_path, err))?; + zone_requests.write_to(&self.inner.log, &ledger_path).await?; Ok(()) } @@ -1802,25 +1840,10 @@ impl ServiceManager { request: ServiceZoneRequest, ) -> Result<(), Error> { let mut existing_zones = self.inner.dataset_zones.lock().await; - let config_path = self.storage_services_config_path()?; + let ledger_path = self.storage_services_ledger_path()?; - let mut zone_requests: AllZoneRequests = { - if config_path.exists() { - debug!(self.inner.log, "Reading old storage service requests"); - toml::from_str( - &tokio::fs::read_to_string(&config_path) - .await - .map_err(|err| Error::io_path(&config_path, err))?, - ) - .map_err(|err| Error::TomlDeserialize { - path: config_path.clone(), - err, - })? 
- } else { - debug!(self.inner.log, "No old storage service requests"); - AllZoneRequests::new() - } - }; + let mut zone_requests = + AllZoneRequests::read_from(&self.inner.log, &ledger_path).await?; if !zone_requests .requests @@ -1845,15 +1868,7 @@ impl ServiceManager { ) .await?; - let serialized_services = toml::Value::try_from(&zone_requests) - .expect("Cannot serialize service list"); - let services_str = - toml::to_string(&serialized_services).map_err(|err| { - Error::TomlSerialize { path: config_path.clone(), err } - })?; - tokio::fs::write(&config_path, services_str) - .await - .map_err(|err| Error::io_path(&config_path, err))?; + zone_requests.write_to(&self.inner.log, &ledger_path).await?; Ok(()) } @@ -2423,25 +2438,20 @@ mod test { impl TestConfig { async fn new() -> Self { let config_dir = tempfile::TempDir::new().unwrap(); - tokio::fs::File::create( - config_dir.path().join(PARTIAL_CONFIG_FILENAME), - ) - .await - .unwrap(); Self { config_dir } } fn make_config(&self) -> Config { - let all_svcs_config_path = + let all_svcs_ledger_path = self.config_dir.path().join(SERVICES_CONFIG_FILENAME); - let storage_svcs_config_path = + let storage_svcs_ledger_path = self.config_dir.path().join(STORAGE_SERVICES_CONFIG_FILENAME); Config { sled_id: Uuid::new_v4(), sidecar_revision: "rev_whatever_its_a_test".to_string(), gateway_address: None, - all_svcs_config_path, - storage_svcs_config_path, + all_svcs_ledger_path, + storage_svcs_ledger_path, } } } @@ -2663,7 +2673,7 @@ mod test { // Next, delete the config. This means the service we just created will // not be remembered on the next initialization. let config = test_config.make_config(); - std::fs::remove_file(&config.all_svcs_config_path).unwrap(); + std::fs::remove_file(&config.all_svcs_ledger_path).unwrap(); // Observe that the old service is not re-initialized. 
let mgr = ServiceManager::new( From 4377f1d7fde458a1ef7c6260ad9178042eba532d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Sun, 30 Apr 2023 01:45:37 -0400 Subject: [PATCH 04/39] Stop deleting chelsio addresses during uninstall (#2953) ## Before this PR Running on rack2 and calling `omicron-package uninstall` would involve a fatal termination of the connection, as it would delete the `cxgbe0/ll` and `cxgbe1/ll` IP addresses necessary for contacting the sled. ## After this PR Those addresses are left alone. This is pretty useful for development, as it allows us to run `uninstall` to cleanly wipe a Gimlet, preparing it for future "clean installs". --- sled-hardware/src/cleanup.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/sled-hardware/src/cleanup.rs b/sled-hardware/src/cleanup.rs index 0c14aad7dfd..1a7f8be2f70 100644 --- a/sled-hardware/src/cleanup.rs +++ b/sled-hardware/src/cleanup.rs @@ -6,11 +6,11 @@ use anyhow::Error; use futures::stream::{self, StreamExt, TryStreamExt}; +use illumos_utils::dladm::Dladm; use illumos_utils::dladm::BOOTSTRAP_ETHERSTUB_NAME; use illumos_utils::dladm::BOOTSTRAP_ETHERSTUB_VNIC_NAME; use illumos_utils::dladm::UNDERLAY_ETHERSTUB_NAME; use illumos_utils::dladm::UNDERLAY_ETHERSTUB_VNIC_NAME; -use illumos_utils::dladm::{Dladm, VnicSource}; use illumos_utils::link::LinkKind; use illumos_utils::opte; use illumos_utils::zone::IPADM; @@ -30,14 +30,6 @@ pub fn delete_bootstrap_addresses(log: &Logger) -> Result<(), Error> { delete_addresses_matching_prefixes(log, &[bootstrap_prefix]) } -fn delete_chelsio_addresses(log: &Logger) -> Result<(), Error> { - let prefixes = crate::underlay::find_chelsio_links()? 
- .into_iter() - .map(|link| format!("{}/", link.name())) - .collect::>(); - delete_addresses_matching_prefixes(log, &prefixes) -} - fn delete_addresses_matching_prefixes( log: &Logger, prefixes: &[String], @@ -112,7 +104,6 @@ pub async fn delete_omicron_vnics(log: &Logger) -> Result<(), Error> { pub async fn cleanup_networking_resources(log: &Logger) -> Result<(), Error> { delete_underlay_addresses(log)?; delete_bootstrap_addresses(log)?; - delete_chelsio_addresses(log)?; delete_omicron_vnics(log).await?; delete_etherstub(log)?; opte::delete_all_xde_devices(log)?; From ec3b1e4b1a38e695555d6bc1d28cddb3b5e1d483 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Sun, 30 Apr 2023 01:29:03 -0400 Subject: [PATCH 05/39] [RSS] Explicit set of Bootstrap Agents --- openapi/sled-agent.json | 3 +- sled-agent/src/bootstrap/params.rs | 13 ++ sled-agent/src/rack_setup/config.rs | 2 + sled-agent/src/rack_setup/plan/sled.rs | 4 +- sled-agent/src/rack_setup/service.rs | 123 +++--------------- .../gimlet-standalone/config-rss.toml | 3 + smf/sled-agent/non-gimlet/config-rss.toml | 3 + 7 files changed, 45 insertions(+), 106 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index c41da8ab8b2..ad26e7a609f 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -1745,7 +1745,7 @@ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, "ServiceEnsureBody": { - "description": "Used to request that the Sled initialize certain services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.", + "description": "Used to request that the Sled initialize certain services on initialization.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, 
such as Nexus.", "type": "object", "properties": { "services": { @@ -2138,6 +2138,7 @@ ] }, "ServiceZoneService": { + "description": "Used to request that the Sled initialize certain services.", "type": "object", "properties": { "details": { diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 0fc17cb8e01..146f0dac481 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -11,9 +11,19 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use serde_with::serde_as; use std::borrow::Cow; +use std::collections::HashSet; use std::net::{Ipv4Addr, Ipv6Addr, SocketAddrV6}; use uuid::Uuid; +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum BootstrapAddressDiscovery { + /// Ignore all bootstrap addresses except our own. + OnlyOurs, + /// Ignore all bootstrap addresses except the following. + OnlyThese(HashSet), +} + /// Configuration for the "rack setup service". /// /// The Rack Setup Service should be responsible for one-time setup actions, @@ -24,6 +34,9 @@ use uuid::Uuid; pub struct RackInitializeRequest { pub rack_subnet: Ipv6Addr, + /// Describes how bootstrap addresses should be collected during RSS. + pub bootstrap_discovery: BootstrapAddressDiscovery, + /// The minimum number of sleds required to unlock the rack secret. 
/// /// If this value is less than 2, no rack secret will be created on startup; diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index cba9a9f44f0..9957d9257fd 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -39,6 +39,7 @@ impl SetupServiceConfig { #[cfg(test)] mod test { use super::*; + use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::Gateway; use omicron_common::address::IpRange; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; @@ -47,6 +48,7 @@ mod test { fn test_subnets() { let cfg = SetupServiceConfig { rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), + bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, rack_secret_threshold: 0, gateway: Some(Gateway { address: None, diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index a754aac8e59..f78a4c24845 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -13,7 +13,7 @@ use crate::rack_setup::config::SetupServiceConfig as Config; use serde::{Deserialize, Serialize}; use slog::Logger; use sprockets_host::Ed25519Certificate; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::{Ipv6Addr, SocketAddrV6}; use std::path::{Path, PathBuf}; use thiserror::Error; @@ -119,7 +119,7 @@ impl Plan { pub async fn create( log: &Logger, config: &Config, - bootstrap_addrs: Vec, + bootstrap_addrs: HashSet, ) -> Result { let rack_id = Uuid::new_v4(); diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 1586cfb5de4..ba9310afe9e 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -56,6 +56,7 @@ use super::config::SetupServiceConfig as Config; use crate::bootstrap::config::BOOTSTRAP_AGENT_HTTP_PORT; +use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::SledAgentRequest; 
use crate::bootstrap::rss_handle::BootstrapAgentHandle; use crate::nexus::d2n_params; @@ -94,9 +95,6 @@ use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::path::PathBuf; use thiserror::Error; -// The minimum number of sleds to initialize the rack. -const MINIMUM_SLED_COUNT: usize = 1; - /// Describes errors which may occur while operating the setup service. #[derive(Error, Debug)] pub enum SetupServiceError { @@ -217,24 +215,6 @@ fn rss_completed_marker_path() -> PathBuf { .join("rss-plan-completed.marker") } -// Describes the options when awaiting for peers. -enum PeerExpectation { - // Await a set of peers that matches this group of IPv6 addresses exactly. - // - // TODO: We currently don't deal with the case where: - // - // - RSS boots, sees some sleds, comes up with a plan. - // - RSS reboots, sees a *different* set of sleds, and needs - // to adjust the plan. - // - // This case is fairly tricky because some sleds may have - // already received requests to initialize - modifying the - // allocated subnets would be non-trivial. - LoadOldPlan(HashSet), - // Await any peers, as long as there are at least enough to make a new plan. - CreateNewPlan(usize), -} - /// The implementation of the Rack Setup Service. struct ServiceInner { log: Logger, @@ -458,75 +438,6 @@ impl ServiceInner { Ok(()) } - /// Waits for sufficient neighbors to exist so the initial set of requests - /// can be sent out. 
- async fn wait_for_peers( - &self, - expectation: PeerExpectation, - our_bootstrap_address: Ipv6Addr, - ) -> Result, DdmError> { - let ddm_admin_client = DdmAdminClient::localhost(&self.log)?; - let addrs = retry_notify( - retry_policy_internal_service_aggressive(), - || async { - let peer_addrs = ddm_admin_client - .derive_bootstrap_addrs_from_prefixes(&[ - BootstrapInterface::GlobalZone, - ]) - .await - .map_err(|err| { - BackoffError::transient(format!( - "Failed getting peers from mg-ddm: {err}" - )) - })?; - - let all_addrs = peer_addrs - .chain(iter::once(our_bootstrap_address)) - .collect::>(); - - match expectation { - PeerExpectation::LoadOldPlan(ref expected) => { - if all_addrs.is_superset(expected) { - Ok(all_addrs.into_iter().collect()) - } else { - Err(BackoffError::transient( - concat!( - "Waiting for a LoadOldPlan set ", - "of peers not found yet." - ) - .to_string(), - )) - } - } - PeerExpectation::CreateNewPlan(wanted_peer_count) => { - if all_addrs.len() >= wanted_peer_count { - Ok(all_addrs.into_iter().collect()) - } else { - Err(BackoffError::transient(format!( - "Waiting for {} peers (currently have {})", - wanted_peer_count, - all_addrs.len() - ))) - } - } - } - }, - |message, duration| { - info!( - self.log, - "{} (will retry after {:?})", message, duration - ); - }, - ) - // `retry_policy_internal_service_aggressive()` retries indefinitely on - // transient errors (the only kind we produce), allowing us to - // `.unwrap()` without panicking - .await - .unwrap(); - - Ok(addrs) - } - async fn sled_timesync( &self, sled_address: &SocketAddrV6, @@ -936,19 +847,25 @@ impl ServiceInner { // Wait for either: // - All the peers to re-load an old plan (if one exists) // - Enough peers to create a new plan (if one does not exist) - let maybe_sled_plan = SledPlan::load(&self.log).await?; - let expectation = if let Some(plan) = &maybe_sled_plan { - PeerExpectation::LoadOldPlan( - plan.sleds.keys().map(|a| *a.ip()).collect(), - ) - } else { - 
PeerExpectation::CreateNewPlan(MINIMUM_SLED_COUNT) + let bootstrap_addrs = match &config.bootstrap_discovery { + BootstrapAddressDiscovery::OnlyOurs => { + HashSet::from([local_bootstrap_agent.our_address()]) + } + BootstrapAddressDiscovery::OnlyThese(peers) => peers.clone(), }; - - let addrs = self - .wait_for_peers(expectation, local_bootstrap_agent.our_address()) - .await?; - info!(self.log, "Enough peers exist to enact RSS plan"); + let maybe_sled_plan = SledPlan::load(&self.log).await?; + if let Some(plan) = &maybe_sled_plan { + let stored_peers: HashSet = + plan.sleds.keys().map(|a| *a.ip()).collect(); + if stored_peers != bootstrap_addrs { + return Err(SetupServiceError::BadConfig("Set of sleds requested does not match those in existing sled plan".to_string())); + } + } + if bootstrap_addrs.is_empty() { + return Err(SetupServiceError::BadConfig( + "Must request at least one peer".to_string(), + )); + } // If we created a plan, reuse it. Otherwise, create a new plan. // @@ -961,7 +878,7 @@ impl ServiceInner { plan } else { info!(self.log, "Creating new allocation plan"); - SledPlan::create(&self.log, config, addrs).await? + SledPlan::create(&self.log, config, bootstrap_addrs).await? }; let config = &plan.config; diff --git a/smf/sled-agent/gimlet-standalone/config-rss.toml b/smf/sled-agent/gimlet-standalone/config-rss.toml index 05f28605028..246f57848da 100644 --- a/smf/sled-agent/gimlet-standalone/config-rss.toml +++ b/smf/sled-agent/gimlet-standalone/config-rss.toml @@ -6,6 +6,9 @@ # |...............| <- This /56 is the Rack Subnet rack_subnet = "fd00:1122:3344:0100::" +# Only include "our own sled" in the bootstrap network +bootstrap_discovery = "only_ours" + # The number of sleds required to unlock the rack secret. # # For values less than 2, no rack secret will be generated. 
diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index 05f28605028..246f57848da 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -6,6 +6,9 @@ # |...............| <- This /56 is the Rack Subnet rack_subnet = "fd00:1122:3344:0100::" +# Only include "our own sled" in the bootstrap network +bootstrap_discovery = "only_ours" + # The number of sleds required to unlock the rack secret. # # For values less than 2, no rack secret will be generated. From 9d00c936dfdea3090ccc9fefd9be2085749414de Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Sun, 30 Apr 2023 09:19:07 -0400 Subject: [PATCH 06/39] Fix tests --- openapi/sled-agent.json | 3 ++- sled-agent/src/params.rs | 4 ++-- sled-agent/src/services.rs | 19 +++++++++++-------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index c41da8ab8b2..a138ba1537a 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -1745,7 +1745,7 @@ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, "ServiceEnsureBody": { - "description": "Used to request that the Sled initialize certain services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.", + "description": "Used to request that the Sled initialize multiple services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.", "type": "object", "properties": { "services": { @@ -2138,6 +2138,7 @@ ] }, "ServiceZoneService": { + "description": "Used to request that the Sled initialize a single service.", "type": "object", "properties": { "details": { diff --git 
a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 0b80c434b4d..d8a9cd5337a 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -575,7 +575,7 @@ impl From for sled_agent_client::types::ServiceZoneRequest { } } -/// Used to request that the Sled initialize certain services. +/// Used to request that the Sled initialize a single service. #[derive( Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, )] @@ -590,7 +590,7 @@ impl From for sled_agent_client::types::ServiceZoneService { } } -/// Used to request that the Sled initialize certain services on initialization. +/// Used to request that the Sled initialize multiple services. /// /// This may be used to record that certain sleds are responsible for /// launching services which may not be associated with a dataset, such diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 77976bf33b9..d8366c85f6a 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -464,9 +464,11 @@ impl ServiceManager { pub async fn load_non_storage_services(&self) -> Result<(), Error> { let log = &self.inner.log; - let services = - AllZoneRequests::read_from(log, &self.services_ledger_path()?) 
- .await?; + let ledger = self.services_ledger_path()?; + if !ledger.exists() { + return Ok(()); + } + let services = AllZoneRequests::read_from(log, &ledger).await?; let mut existing_zones = self.inner.zones.lock().await; // Initialize and DNS and NTP services first as they are required @@ -535,11 +537,11 @@ impl ServiceManager { pub async fn load_storage_services(&self) -> Result<(), Error> { let log = &self.inner.log; - let services = AllZoneRequests::read_from( - log, - &self.storage_services_ledger_path()?, - ) - .await?; + let ledger = self.storage_services_ledger_path()?; + if !ledger.exists() { + return Ok(()); + } + let services = AllZoneRequests::read_from(log, &ledger).await?; let mut existing_zones = self.inner.dataset_zones.lock().await; self.initialize_services_locked( &mut existing_zones, @@ -558,6 +560,7 @@ impl ServiceManager { underlay_address: Ipv6Addr, rack_id: Uuid, ) -> Result<(), Error> { + debug!(&self.inner.log, "sled agent started"; "underlay_address" => underlay_address.to_string()); self.inner .sled_info .set(SledAgentInfo { From cfb7cbc61ae370390c8bc1c6d41f1f220f928371 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Sun, 30 Apr 2023 15:59:42 -0400 Subject: [PATCH 07/39] make serialization happier --- sled-agent/src/bootstrap/params.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 146f0dac481..9c01c75af92 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -16,7 +16,7 @@ use std::net::{Ipv4Addr, Ipv6Addr, SocketAddrV6}; use uuid::Uuid; #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] -#[serde(rename_all = "snake_case")] +#[serde(rename_all = "snake_case", tag = "type")] pub enum BootstrapAddressDiscovery { /// Ignore all bootstrap addresses except our own. 
OnlyOurs, From 2ab628b75e4f59432371469526c74b56e93e76b7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 May 2023 11:49:50 -0400 Subject: [PATCH 08/39] Store service ledgers in duplicate in M.2s --- sled-agent/src/ledger.rs | 376 ++++++++++++++++++++++++++++++ sled-agent/src/lib.rs | 1 + sled-agent/src/services.rs | 254 ++++++++++---------- sled-agent/src/storage_manager.rs | 9 + 4 files changed, 512 insertions(+), 128 deletions(-) create mode 100644 sled-agent/src/ledger.rs diff --git a/sled-agent/src/ledger.rs b/sled-agent/src/ledger.rs new file mode 100644 index 00000000000..5462630b553 --- /dev/null +++ b/sled-agent/src/ledger.rs @@ -0,0 +1,376 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Utilities to help reading/writing toml files from/to multiple paths + +use async_trait::async_trait; +use serde::{de::DeserializeOwned, Serialize}; +use slog::Logger; +use std::path::{Path, PathBuf}; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Cannot serialize TOML to file {path}: {err}")] + TomlSerialize { path: PathBuf, err: toml::ser::Error }, + + #[error("Cannot deserialize TOML from file {path}: {err}")] + TomlDeserialize { path: PathBuf, err: toml::de::Error }, + + #[error("Failed to perform I/O: {message}: {err}")] + Io { + message: String, + #[source] + err: std::io::Error, + }, + + #[error("Failed to write the ledger to storage")] + FailedToAccessStorage, +} + +impl Error { + fn io_path(path: &Path, err: std::io::Error) -> Self { + Self::Io { message: format!("Error accessing {}", path.display()), err } + } +} + +impl From for omicron_common::api::external::Error { + fn from(err: Error) -> Self { + omicron_common::api::external::Error::InternalError { + internal_message: err.to_string(), + } + } +} + +// TODO: .json EXPECTORATE test? +// +// ... yes, but maybe not here? 
Seems like we gotta know the type of "T" to pull +// this off. + +/// Manage the serialization and deserialization of a ledger of information. +/// +/// This structure is intended to help with serialization and deserialization +/// of configuration information to both M.2s. +pub struct Ledger { + log: Logger, + ledger: T, + paths: Vec, +} + +impl Ledger { + /// Reads the ledger from any of the provided `paths`. + /// + /// Returns the following, in order: + /// - The ledger with the highest generation number + /// - If none exists, returns a default ledger + pub async fn new(log: &Logger, paths: Vec) -> Result { + // Read all the ledgers that we can. + let mut ledgers = vec![]; + for path in paths.iter() { + if let Ok(ledger) = T::read_from(log, &path).await { + ledgers.push(ledger); + } + } + + // Return the ledger with the highest generation number. + let ledger = ledgers.into_iter().reduce(|prior, ledger| { + if ledger.is_newer_than(&prior) { + ledger + } else { + prior + } + }); + + // If we can't read either ledger, start a new one. + let ledger = ledger.unwrap_or_else(|| T::default()); + + Ok(Self { log: log.clone(), ledger, paths }) + } + + pub fn data(&self) -> &T { + &self.ledger + } + + pub fn data_mut(&mut self) -> &mut T { + &mut self.ledger + } + + /// Writes the ledger back to all config directories. + /// + /// Succeeds if at least one of the writes succeeds. + pub async fn commit(&mut self) -> Result<(), Error> { + // Bump the generation number any time we want to commit the ledger. + self.ledger.generation_bump(); + + let mut one_successful_write = false; + for path in self.paths.iter() { + if let Err(e) = self.atomic_write(&path).await { + warn!(self.log, "Failed to write to {}: {e}", path.display()); + } else { + one_successful_write = true; + } + } + + if !one_successful_write { + return Err(Error::FailedToAccessStorage); + } + Ok(()) + } + + // Atomically serialize and write the ledger to storage. 
+ // + // We accomplish this by first writing to a temporary file, then + // renaming to the target location. + async fn atomic_write(&self, path: &Path) -> Result<(), Error> { + let mut tmp_path = path.to_path_buf(); + let tmp_filename = format!( + ".{}.tmp", + tmp_path + .file_name() + .expect("Should have file name") + .to_string_lossy() + ); + tmp_path.set_file_name(tmp_filename); + + self.ledger.write_to(&self.log, &tmp_path).await?; + + tokio::fs::rename(&tmp_path, &path) + .await + .map_err(|err| Error::io_path(&path, err))?; + + Ok(()) + } +} + +#[async_trait] +pub trait Ledgerable: + Default + DeserializeOwned + Serialize + Send + Sync +{ + /// Returns true if [Self] is newer than `other`. + fn is_newer_than(&self, other: &Self) -> bool; + + /// Increments the gneration number. + fn generation_bump(&mut self); + + /// Reads from `path` as a toml-serialized version of `Self`. + async fn read_from(log: &Logger, path: &Path) -> Result { + if path.exists() { + debug!(log, "Reading ledger from {}", path.display()); + toml::from_str( + &tokio::fs::read_to_string(&path) + .await + .map_err(|err| Error::io_path(&path, err))?, + ) + .map_err(|err| Error::TomlDeserialize { + path: path.to_path_buf(), + err, + }) + } else { + debug!(log, "No ledger in {}", path.display()); + Ok(Self::default()) + } + } + + /// Writes to `path` as a toml-serialized version of `Self`. 
+ async fn write_to(&self, log: &Logger, path: &Path) -> Result<(), Error> { + debug!(log, "Writing ledger to {}", path.display()); + let serialized = + toml::Value::try_from(&self).expect("Cannot serialize ledger"); + let as_str = toml::to_string(&serialized).map_err(|err| { + Error::TomlSerialize { path: path.to_path_buf(), err } + })?; + tokio::fs::write(&path, as_str) + .await + .map_err(|err| Error::io_path(&path, err))?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use omicron_test_utils::dev::test_setup_log; + + #[derive(Serialize, serde::Deserialize, Default, Eq, PartialEq, Debug)] + struct Data { + generation: u64, + contents: String, + } + + impl Ledgerable for Data { + fn is_newer_than(&self, other: &Self) -> bool { + self.generation > other.generation + } + + fn generation_bump(&mut self) { + self.generation = self.generation + 1; + } + } + + #[tokio::test] + async fn test_create_default_ledger() { + let logctx = test_setup_log("create_default_ledger"); + let log = &logctx.log; + + let config_dir = tempfile::TempDir::new().unwrap(); + let ledger = + Ledger::::new(&log, vec![config_dir.path().to_path_buf()]) + .await + .expect("Failed to create ledger"); + + // Since we haven't previously stored anything, expect to read a default + // value. 
+ assert_eq!(ledger.data(), &Data::default()); + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_create_ledger_reads_from_storage() { + let logctx = test_setup_log("create_ledger_reads_from_storage"); + let log = &logctx.log; + + let config_dir = tempfile::TempDir::new().unwrap(); + let config_path = config_dir.path().join("ledger.toml"); + + // Create the ledger within a configuration directory + let mut ledger = Ledger::::new(&log, vec![config_path.clone()]) + .await + .expect("Failed to create ledger"); + ledger.data_mut().contents = "new contents".to_string(); + ledger.commit().await.expect("Failed to write ledger"); + assert!(config_path.exists()); + + drop(ledger); + + // Re-create the ledger, observe the new contents. + let ledger = Ledger::::new(&log, vec![config_path.clone()]) + .await + .expect("Failed to create ledger"); + + assert_eq!(ledger.data().contents, "new contents"); + assert_eq!(ledger.data().generation, 1); + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_create_ledger_reads_latest_from_storage() { + let logctx = test_setup_log("create_ledger_reads_latest_from_storage"); + let log = &logctx.log; + + // Create the ledger, initialize contents. + let config_dirs = vec![ + tempfile::TempDir::new().unwrap(), + tempfile::TempDir::new().unwrap(), + ]; + let config_paths = config_dirs + .iter() + .map(|d| d.path().join("ledger.toml")) + .collect::>(); + + let mut ledger = Ledger::::new(&log, config_paths.clone()) + .await + .expect("Failed to create ledger"); + ledger.data_mut().contents = "new contents".to_string(); + ledger.commit().await.expect("Failed to write ledger"); + + assert!(config_paths[0].exists()); + assert!(config_paths[1].exists()); + + drop(ledger); + + // Let's write again, but only using one of the two config dirs. 
+ let mut ledger = Ledger::::new(&log, config_paths[..=1].to_vec()) + .await + .expect("Failed to create ledger"); + ledger.data_mut().contents = "even newer contents".to_string(); + ledger.commit().await.expect("Failed to write ledger"); + + drop(ledger); + + // Re-create the ledger (using both config dirs), observe the newest contents. + let ledger = Ledger::::new(&log, config_paths.clone()) + .await + .expect("Failed to create ledger"); + + assert_eq!(ledger.data().contents, "even newer contents"); + assert_eq!(ledger.data().generation, 2); + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_commit_handles_write_failures() { + let logctx = test_setup_log("create_commit_handles_write_failures"); + let log = &logctx.log; + + // Create the ledger, initialize contents. + let mut config_dirs = vec![ + tempfile::TempDir::new().unwrap(), + tempfile::TempDir::new().unwrap(), + ]; + let config_paths = config_dirs + .iter() + .map(|d| d.path().join("ledger.toml")) + .collect::>(); + + let mut ledger = Ledger::::new(&log, config_paths.clone()) + .await + .expect("Failed to create ledger"); + ledger.data_mut().contents = "written to both configs".to_string(); + ledger.commit().await.expect("Failed to write ledger"); + + assert!(config_paths[0].exists()); + assert!(config_paths[1].exists()); + + drop(ledger); + + // Remove one of the config directories, try again. + // + // We should still be able to read and write the ledger. 
+ config_dirs.remove(1); + assert!(config_paths[0].exists()); + assert!(!config_paths[1].exists()); + + let mut ledger = Ledger::::new(&log, config_paths.clone()) + .await + .expect("Failed to create ledger"); + + assert_eq!(ledger.data().contents, "written to both configs"); + assert_eq!(ledger.data().generation, 1); + ledger.data_mut().contents = "written to one config".to_string(); + ledger.commit().await.expect("Failed to write ledger"); + + drop(ledger); + + // We can still parse the ledger from a single path + let ledger = Ledger::::new(&log, config_paths.clone()) + .await + .expect("Failed to create ledger"); + assert_eq!(ledger.data().contents, "written to one config"); + assert_eq!(ledger.data().generation, 2); + + drop(ledger); + + // Remove the last config directory, try again. + // + // We should not be able to write the ledger. + drop(config_dirs); + assert!(!config_paths[0].exists()); + assert!(!config_paths[1].exists()); + + let mut ledger = Ledger::::new(&log, config_paths.clone()) + .await + .expect("Failed to create ledger"); + + assert_eq!(ledger.data(), &Data::default()); + let err = ledger.commit().await.unwrap_err(); + assert!( + matches!(err, Error::FailedToAccessStorage), + "Unexpected error: {err}" + ); + + logctx.cleanup_successful(); + } +} diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 9682fa3cc80..0a17f99e280 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -22,6 +22,7 @@ pub mod config; mod http_entrypoints; mod instance; mod instance_manager; +mod ledger; mod nexus; pub mod params; mod profile; diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index d8366c85f6a..d270d9b29aa 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -25,6 +25,7 @@ //! - [ServiceManager::activate_switch] exposes an API to specifically enable //! or disable (via [ServiceManager::deactivate_switch]) the switch zone. 
+use crate::ledger::{Ledger, Ledgerable}; use crate::params::{ DendriteAsic, ServiceEnsureBody, ServiceType, ServiceZoneRequest, ServiceZoneService, TimeSync, ZoneType, @@ -61,6 +62,7 @@ use omicron_common::address::OXIMETER_PORT; use omicron_common::address::RACK_PREFIX; use omicron_common::address::SLED_PREFIX; use omicron_common::address::WICKETD_PORT; +use omicron_common::api::external::Generation; use omicron_common::backoff::{ retry_notify, retry_policy_internal_service_aggressive, retry_policy_local, BackoffError, @@ -106,6 +108,9 @@ pub enum Error { #[error("Failed to find device {device}")] MissingDevice { device: String }, + #[error("Failed to access ledger: {0}")] + Ledger(#[from] crate::ledger::Error), + #[error("Sled Agent not initialized yet")] SledAgentNotReady, @@ -198,11 +203,6 @@ pub struct Config { /// An optional internet gateway address for external services. pub gateway_address: Option, - - // The path for the ServiceManager to store information about - // all running services. - all_svcs_ledger_path: PathBuf, - storage_svcs_ledger_path: PathBuf, } impl Config { @@ -211,94 +211,35 @@ impl Config { sidecar_revision: String, gateway_address: Option, ) -> Self { - Self { - sled_id, - sidecar_revision, - gateway_address, - all_svcs_ledger_path: default_services_ledger_path(), - storage_svcs_ledger_path: default_storage_services_ledger_path(), - } + Self { sled_id, sidecar_revision, gateway_address } } } -// The filename of ServiceManager's internal storage. +// The filename of the ledger, within the provided directory. 
const SERVICES_CONFIG_FILENAME: &str = "services.toml"; const STORAGE_SERVICES_CONFIG_FILENAME: &str = "storage-services.toml"; -// The default path to service configuration -fn default_services_ledger_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join(SERVICES_CONFIG_FILENAME) -} - -// The default path to storage service configuration -fn default_storage_services_ledger_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join(STORAGE_SERVICES_CONFIG_FILENAME) -} - -// TODO(ideas): -// - "ServiceLedger" -// - Manages the serializable "AllZoneRequests" object -// - Constructor which reads from config location (kinda like -// "read_from") -// - ... Writer which *knows the type* to be serialized, so can direct it to the -// appropriate output path. -// -// - TODO: later: Can also make the path writing safer, by... -// - ... TODO: Writing to both M.2s, basically using multiple output paths -// - ... TODO: Using a temporary file and renaming it to make the update atomic -// - ... TODO: Add a .json EXPECTORATE test for the format of "AllZoneRequests" -// - we need to be careful not to break compatibility in the future. - // A wrapper around `ZoneRequest`, which allows it to be serialized // to a toml file. #[derive(Clone, serde::Serialize, serde::Deserialize)] struct AllZoneRequests { + generation: Generation, requests: Vec, } -impl AllZoneRequests { - fn new() -> Self { - Self { requests: vec![] } +impl Default for AllZoneRequests { + fn default() -> Self { + Self { generation: Generation::new(), requests: vec![] } } +} - // Reads from `path` as a toml-serialized version of `Self`. 
- async fn read_from(log: &Logger, path: &Path) -> Result { - if path.exists() { - debug!( - log, - "Reading old storage service requests from {}", - path.display() - ); - toml::from_str( - &tokio::fs::read_to_string(&path) - .await - .map_err(|err| Error::io_path(&path, err))?, - ) - .map_err(|err| Error::TomlDeserialize { - path: path.to_path_buf(), - err, - }) - } else { - debug!(log, "No old storage service requests"); - Ok(AllZoneRequests::new()) - } +impl Ledgerable for AllZoneRequests { + fn is_newer_than(&self, other: &AllZoneRequests) -> bool { + self.generation >= other.generation } - // Writes to `path` as a toml-serialized version of `Self`. - async fn write_to(&self, log: &Logger, path: &Path) -> Result<(), Error> { - debug!(log, "Writing zone request configuration to {}", path.display()); - let serialized_services = toml::Value::try_from(&self) - .expect("Cannot serialize service list"); - let services_str = - toml::to_string(&serialized_services).map_err(|err| { - Error::TomlSerialize { path: path.to_path_buf(), err } - })?; - tokio::fs::write(&path, services_str) - .await - .map_err(|err| Error::io_path(&path, err))?; - Ok(()) + fn generation_bump(&mut self) { + self.generation = self.generation.next(); } } @@ -377,8 +318,8 @@ pub struct ServiceManagerInner { // TODO(https://github.com/oxidecomputer/omicron/issues/2888): We will // need this interface to provision Zone filesystems on explicit U.2s, // rather than simply placing them on the ramdisk. 
- #[allow(dead_code)] storage: StorageManager, + ledger_directory_override: OnceCell, } // Late-binding information, only known once the sled agent is up and @@ -453,23 +394,62 @@ impl ServiceManager { sled_info: OnceCell::new(), switch_zone_bootstrap_address, storage, + ledger_directory_override: OnceCell::new(), }), }; Ok(mgr) } + #[cfg(test)] + async fn override_ledger_directory(&self, path: PathBuf) { + self.inner.ledger_directory_override.set(path).unwrap(); + } + pub fn switch_zone_bootstrap_address(&self) -> Ipv6Addr { self.inner.switch_zone_bootstrap_address } + async fn all_service_ledgers(&self) -> Vec { + if let Some(dir) = self.inner.ledger_directory_override.get() { + return vec![dir.join(SERVICES_CONFIG_FILENAME)]; + } + self.inner + .storage + .resources() + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .await + .into_iter() + .map(|p| p.join(SERVICES_CONFIG_FILENAME)) + .collect() + } + + async fn all_storage_service_ledgers(&self) -> Vec { + if let Some(dir) = self.inner.ledger_directory_override.get() { + return vec![dir.join(STORAGE_SERVICES_CONFIG_FILENAME)]; + } + + self.inner + .storage + .resources() + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .await + .into_iter() + .map(|p| p.join(STORAGE_SERVICES_CONFIG_FILENAME)) + .collect() + } + pub async fn load_non_storage_services(&self) -> Result<(), Error> { let log = &self.inner.log; - let ledger = self.services_ledger_path()?; - if !ledger.exists() { + let mut existing_zones = self.inner.zones.lock().await; + let ledger = Ledger::::new( + log, + self.all_service_ledgers().await, + ) + .await?; + let services = ledger.data(); + if services.requests.is_empty() { return Ok(()); } - let services = AllZoneRequests::read_from(log, &ledger).await?; - let mut existing_zones = self.inner.zones.lock().await; // Initialize and DNS and NTP services first as they are required // for time synchronization, which is a pre-requisite for the other @@ -537,12 +517,16 @@ impl 
ServiceManager { pub async fn load_storage_services(&self) -> Result<(), Error> { let log = &self.inner.log; - let ledger = self.storage_services_ledger_path()?; - if !ledger.exists() { + let mut existing_zones = self.inner.dataset_zones.lock().await; + let ledger = Ledger::::new( + log, + self.all_storage_service_ledgers().await, + ) + .await?; + let services = ledger.data(); + if services.requests.is_empty() { return Ok(()); } - let services = AllZoneRequests::read_from(log, &ledger).await?; - let mut existing_zones = self.inner.dataset_zones.lock().await; self.initialize_services_locked( &mut existing_zones, &services.requests, @@ -593,26 +577,6 @@ impl ServiceManager { self.inner.sled_mode } - // Returns either the path to the explicitly provided ledger path, or - // chooses the default one. - fn services_ledger_path(&self) -> Result { - if let Some(info) = self.inner.sled_info.get() { - Ok(info.config.all_svcs_ledger_path.clone()) - } else { - Err(Error::SledAgentNotReady) - } - } - - // Returns either the path to the explicitly provided ledger path, or - // chooses the default one. - fn storage_services_ledger_path(&self) -> Result { - if let Some(info) = self.inner.sled_info.get() { - Ok(info.config.storage_svcs_ledger_path.clone()) - } else { - Err(Error::SledAgentNotReady) - } - } - // Advertise the /64 prefix of `address`, unless we already have. // // This method only blocks long enough to check our HashSet of @@ -1814,15 +1778,20 @@ impl ServiceManager { &self, request: ServiceEnsureBody, ) -> Result<(), Error> { + let log = &self.inner.log; let mut existing_zones = self.inner.zones.lock().await; - let ledger_path = self.services_ledger_path()?; - let old_zone_requests = - AllZoneRequests::read_from(&self.inner.log, &ledger_path).await?; + // Read the existing set of services from the ledger. 
+ let mut ledger = Ledger::::new( + log, + self.all_service_ledgers().await, + ) + .await?; + let ledger_zone_requests = ledger.data_mut(); let new_zone_requests: Vec = { let known_set: HashSet<&ServiceZoneRequest> = HashSet::from_iter( - old_zone_requests.requests.iter().map(|r| &r.zone), + ledger_zone_requests.requests.iter().map(|r| &r.zone), ); let requested_set = HashSet::from_iter(request.services.iter()); @@ -1834,7 +1803,7 @@ impl ServiceManager { // the case of changing configurations, rather than just doing // that removal implicitly. warn!( - self.inner.log, + log, "Cannot request services on this sled, differing configurations: {:#?}", known_set.symmetric_difference(&requested_set) ); @@ -1846,7 +1815,7 @@ impl ServiceManager { .collect::>() }; - let mut zone_requests = AllZoneRequests::new(); + let mut zone_requests = AllZoneRequests::default(); for zone in new_zone_requests.into_iter() { let root = PathBuf::from(ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT); zone_requests.requests.push(ZoneRequest { zone, root }); @@ -1858,8 +1827,9 @@ impl ServiceManager { ) .await?; - zone_requests.requests.append(&mut old_zone_requests.requests.clone()); - zone_requests.write_to(&self.inner.log, &ledger_path).await?; + // Update the services in the ledger and write it back to both M.2s + ledger_zone_requests.requests.append(&mut zone_requests.requests); + ledger.commit().await?; Ok(()) } @@ -1873,13 +1843,18 @@ impl ServiceManager { &self, request: ServiceZoneRequest, ) -> Result<(), Error> { + let log = &self.inner.log; let mut existing_zones = self.inner.dataset_zones.lock().await; - let ledger_path = self.storage_services_ledger_path()?; - let mut zone_requests = - AllZoneRequests::read_from(&self.inner.log, &ledger_path).await?; + // Read the existing set of services from the ledger. 
+ let mut ledger = Ledger::::new( + log, + self.all_storage_service_ledgers().await, + ) + .await?; + let ledger_zone_requests = ledger.data_mut(); - if !zone_requests + if !ledger_zone_requests .requests .iter() .any(|zone_request| zone_request.zone.id == request.id) @@ -1893,16 +1868,18 @@ impl ServiceManager { let root = dataset .pool() .dataset_mountpoint(sled_hardware::disk::ZONE_DATASET); - zone_requests.requests.push(ZoneRequest { zone: request, root }); + ledger_zone_requests + .requests + .push(ZoneRequest { zone: request, root }); } self.initialize_services_locked( &mut existing_zones, - &zone_requests.requests, + &ledger_zone_requests.requests, ) .await?; - zone_requests.write_to(&self.inner.log, &ledger_path).await?; + ledger.commit().await?; Ok(()) } @@ -2486,16 +2463,10 @@ mod test { } fn make_config(&self) -> Config { - let all_svcs_ledger_path = - self.config_dir.path().join(SERVICES_CONFIG_FILENAME); - let storage_svcs_ledger_path = - self.config_dir.path().join(STORAGE_SERVICES_CONFIG_FILENAME); Config { sled_id: Uuid::new_v4(), sidecar_revision: "rev_whatever_its_a_test".to_string(), gateway_address: None, - all_svcs_ledger_path, - storage_svcs_ledger_path, } } } @@ -2523,6 +2494,10 @@ mod test { ) .await .unwrap(); + mgr.override_ledger_directory( + test_config.config_dir.path().to_path_buf(), + ) + .await; let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), @@ -2569,6 +2544,10 @@ mod test { ) .await .unwrap(); + mgr.override_ledger_directory( + test_config.config_dir.path().to_path_buf(), + ) + .await; let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), @@ -2618,6 +2597,10 @@ mod test { ) .await .unwrap(); + mgr.override_ledger_directory( + test_config.config_dir.path().to_path_buf(), + ) + .await; let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), @@ -2655,6 +2638,10 @@ mod test { ) .await .unwrap(); + mgr.override_ledger_directory( + 
test_config.config_dir.path().to_path_buf(), + ) + .await; let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), @@ -2701,6 +2688,10 @@ mod test { ) .await .unwrap(); + mgr.override_ledger_directory( + test_config.config_dir.path().to_path_buf(), + ) + .await; let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), Ipv6Addr::new(0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01), @@ -2721,8 +2712,10 @@ mod test { // Next, delete the config. This means the service we just created will // not be remembered on the next initialization. - let config = test_config.make_config(); - std::fs::remove_file(&config.all_svcs_ledger_path).unwrap(); + std::fs::remove_file( + test_config.config_dir.path().join(SERVICES_CONFIG_FILENAME), + ) + .unwrap(); // Observe that the old service is not re-initialized. let mgr = ServiceManager::new( @@ -2740,6 +2733,11 @@ mod test { ) .await .unwrap(); + mgr.override_ledger_directory( + test_config.config_dir.path().to_path_buf(), + ) + .await; + let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), Ipv6Addr::new(0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01), diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 8e23fc89297..c2aed3ad968 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -225,6 +225,15 @@ impl StorageResources { self.all_zpools(DiskVariant::M2).await } + /// Returns all mountpoints within all M.2s for a particular dataset. 
+ pub async fn all_m2_mountpoints(&self, dataset: &str) -> Vec { + let m2_zpools = self.all_m2_zpools().await; + m2_zpools + .iter() + .map(|zpool| zpool.dataset_mountpoint(dataset)) + .collect() + } + pub async fn all_zpools(&self, variant: DiskVariant) -> Vec { let disks = self.disks.lock().await; disks From c37e57ef76497d89602e00964bcb9b2eac3326ad Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 May 2023 11:59:41 -0400 Subject: [PATCH 09/39] Improve parsing for toml, openapi --- openapi/bootstrap-agent.json | 52 +++++++++++++++++++ sled-agent/src/bootstrap/params.rs | 2 +- sled-agent/src/rack_setup/service.rs | 2 +- .../gimlet-standalone/config-rss.toml | 2 +- smf/sled-agent/non-gimlet/config-rss.toml | 2 +- 5 files changed, 56 insertions(+), 4 deletions(-) diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 3264d090b37..68b1851345c 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -113,6 +113,49 @@ } }, "schemas": { + "BootstrapAddressDiscovery": { + "oneOf": [ + { + "description": "Ignore all bootstrap addresses except our own.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "only_ours" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Ignore all bootstrap addresses except the following.", + "type": "object", + "properties": { + "addrs": { + "type": "array", + "items": { + "type": "string", + "format": "ipv6" + }, + "uniqueItems": true + }, + "type": { + "type": "string", + "enum": [ + "only_these" + ] + } + }, + "required": [ + "addrs", + "type" + ] + } + ] + }, "Component": { "type": "object", "properties": { @@ -239,6 +282,14 @@ "description": "Configuration for the \"rack setup service\".\n\nThe Rack Setup Service should be responsible for one-time setup actions, such as CockroachDB placement and initialization. 
Without operator intervention, however, these actions need a way to be automated in our deployment.", "type": "object", "properties": { + "bootstrap_discovery": { + "description": "Describes how bootstrap addresses should be collected during RSS.", + "allOf": [ + { + "$ref": "#/components/schemas/BootstrapAddressDiscovery" + } + ] + }, "dns_servers": { "description": "The external DNS server addresses.", "type": "array", @@ -281,6 +332,7 @@ } }, "required": [ + "bootstrap_discovery", "dns_servers", "internal_services_ip_pool_ranges", "ntp_servers", diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 9c01c75af92..1dbc1824fc5 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -21,7 +21,7 @@ pub enum BootstrapAddressDiscovery { /// Ignore all bootstrap addresses except our own. OnlyOurs, /// Ignore all bootstrap addresses except the following. - OnlyThese(HashSet), + OnlyThese { addrs: HashSet }, } /// Configuration for the "rack setup service". 
diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index ba9310afe9e..7203e077a22 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -851,7 +851,7 @@ impl ServiceInner { BootstrapAddressDiscovery::OnlyOurs => { HashSet::from([local_bootstrap_agent.our_address()]) } - BootstrapAddressDiscovery::OnlyThese(peers) => peers.clone(), + BootstrapAddressDiscovery::OnlyThese { addrs } => addrs.clone(), }; let maybe_sled_plan = SledPlan::load(&self.log).await?; if let Some(plan) = &maybe_sled_plan { diff --git a/smf/sled-agent/gimlet-standalone/config-rss.toml b/smf/sled-agent/gimlet-standalone/config-rss.toml index 246f57848da..68f48d51d53 100644 --- a/smf/sled-agent/gimlet-standalone/config-rss.toml +++ b/smf/sled-agent/gimlet-standalone/config-rss.toml @@ -7,7 +7,7 @@ rack_subnet = "fd00:1122:3344:0100::" # Only include "our own sled" in the bootstrap network -bootstrap_discovery = "only_ours" +bootstrap_discovery.type = "only_ours" # The number of sleds required to unlock the rack secret. # diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index 246f57848da..68f48d51d53 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -7,7 +7,7 @@ rack_subnet = "fd00:1122:3344:0100::" # Only include "our own sled" in the bootstrap network -bootstrap_discovery = "only_ours" +bootstrap_discovery.type = "only_ours" # The number of sleds required to unlock the rack secret. 
# From 541f68dee6cdf9be61f2a0c71475ccbdccd65def Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 May 2023 12:29:53 -0400 Subject: [PATCH 10/39] Remove the comments about the ledger, we do that in #2972 --- sled-agent/src/services.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index d8366c85f6a..7584729e8fe 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -237,20 +237,6 @@ fn default_storage_services_ledger_path() -> PathBuf { .join(STORAGE_SERVICES_CONFIG_FILENAME) } -// TODO(ideas): -// - "ServiceLedger" -// - Manages the serializable "AllZoneRequests" object -// - Constructor which reads from config location (kinda like -// "read_from") -// - ... Writer which *knows the type* to be serialized, so can direct it to the -// appropriate output path. -// -// - TODO: later: Can also make the path writing safer, by... -// - ... TODO: Writing to both M.2s, basically using multiple output paths -// - ... TODO: Using a temporary file and renaming it to make the update atomic -// - ... TODO: Add a .json EXPECTORATE test for the format of "AllZoneRequests" -// - we need to be careful not to break compatibility in the future. - // A wrapper around `ZoneRequest`, which allows it to be serialized // to a toml file. #[derive(Clone, serde::Serialize, serde::Deserialize)] From 5d599516d4e4504bbae00347dcddbeea8f17146a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 May 2023 12:30:41 -0400 Subject: [PATCH 11/39] configs -> ledgers --- sled-agent/src/services.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 7584729e8fe..203650e8f8b 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -222,19 +222,19 @@ impl Config { } // The filename of ServiceManager's internal storage. 
-const SERVICES_CONFIG_FILENAME: &str = "services.toml"; -const STORAGE_SERVICES_CONFIG_FILENAME: &str = "storage-services.toml"; +const SERVICES_LEDGER_FILENAME: &str = "services.toml"; +const STORAGE_SERVICES_LEDGER_FILENAME: &str = "storage-services.toml"; // The default path to service configuration fn default_services_ledger_path() -> PathBuf { Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join(SERVICES_CONFIG_FILENAME) + .join(SERVICES_LEDGER_FILENAME) } // The default path to storage service configuration fn default_storage_services_ledger_path() -> PathBuf { Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join(STORAGE_SERVICES_CONFIG_FILENAME) + .join(STORAGE_SERVICES_LEDGER_FILENAME) } // A wrapper around `ZoneRequest`, which allows it to be serialized @@ -1358,13 +1358,13 @@ impl ServiceManager { )); // The filename of a half-completed config, in need of parameters supplied at // runtime. - const PARTIAL_CONFIG_FILENAME: &str = "config-partial.toml"; + const PARTIAL_LEDGER_FILENAME: &str = "config-partial.toml"; // The filename of a completed config, merging the partial config with // additional appended parameters known at runtime. 
- const COMPLETE_CONFIG_FILENAME: &str = "config.toml"; + const COMPLETE_LEDGER_FILENAME: &str = "config.toml"; let partial_config_path = - config_dir.join(PARTIAL_CONFIG_FILENAME); - let config_path = config_dir.join(COMPLETE_CONFIG_FILENAME); + config_dir.join(PARTIAL_LEDGER_FILENAME); + let config_path = config_dir.join(COMPLETE_LEDGER_FILENAME); tokio::fs::copy(partial_config_path, &config_path) .await .map_err(|err| Error::io_path(&config_path, err))?; @@ -2473,9 +2473,9 @@ mod test { fn make_config(&self) -> Config { let all_svcs_ledger_path = - self.config_dir.path().join(SERVICES_CONFIG_FILENAME); + self.config_dir.path().join(SERVICES_LEDGER_FILENAME); let storage_svcs_ledger_path = - self.config_dir.path().join(STORAGE_SERVICES_CONFIG_FILENAME); + self.config_dir.path().join(STORAGE_SERVICES_LEDGER_FILENAME); Config { sled_id: Uuid::new_v4(), sidecar_revision: "rev_whatever_its_a_test".to_string(), From ed20fffa842f9f048373d59cff5fe157794b1e6f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 May 2023 12:49:10 -0400 Subject: [PATCH 12/39] review feedback --- sled-agent/src/services.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 203650e8f8b..42119cb72d5 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -563,7 +563,8 @@ impl ServiceManager { .expect("Sled Agent should only start once"); self.load_non_storage_services().await?; - // TODO: These will fail if the disks aren't attached. + // TODO(https://github.com/oxidecomputer/omicron/issues/2973): + // These will fail if the disks aren't attached. // Should we have a retry loop here? Kinda like we have with the switch // / NTP zone? 
// From 0ae55318b9756c5ae3ee37ea1b7427c2d97c8a89 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 May 2023 17:16:23 -0400 Subject: [PATCH 13/39] We should allow synthetic disks to be used as M2s --- sled-agent/src/storage_manager.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index c2aed3ad968..0c58405e6d0 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -239,11 +239,9 @@ impl StorageResources { disks .values() .filter_map(|disk| { - if let DiskWrapper::Real { disk, .. } = disk { - if disk.variant() == variant { - return Some(disk.zpool_name().clone()); - } - }; + if disk.variant() == variant { + return Some(disk.zpool_name().clone()); + } None }) .collect() From e346367c0308088e09900a73bf04fbddc3362ee0 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 May 2023 16:40:16 -0400 Subject: [PATCH 14/39] [sled-agent and friends] std::path to camino --- Cargo.lock | 7 ++ illumos-utils/Cargo.toml | 1 + illumos-utils/src/fstyp.rs | 4 +- illumos-utils/src/opte/illumos.rs | 4 +- illumos-utils/src/running_zone.rs | 19 +++-- illumos-utils/src/zfs.rs | 6 +- illumos-utils/src/zone.rs | 6 +- illumos-utils/src/zpool.rs | 8 +- installinator/src/hardware.rs | 2 +- installinator/src/write.rs | 13 ++- nexus/Cargo.toml | 1 + nexus/src/app/sagas/instance_migrate.rs | 3 +- nexus/test-utils/Cargo.toml | 2 + nexus/test-utils/src/lib.rs | 14 ++-- nexus/tests/integration_tests/instances.rs | 3 +- nexus/tests/integration_tests/sleds.rs | 3 +- sled-agent/Cargo.toml | 2 + sled-agent/src/bin/sled-agent-sim.rs | 4 +- sled-agent/src/bin/sled-agent.rs | 4 +- sled-agent/src/bootstrap/agent.rs | 10 +-- sled-agent/src/bootstrap/params.rs | 4 +- sled-agent/src/config.rs | 20 ++--- sled-agent/src/instance.rs | 2 +- sled-agent/src/ledger.rs | 52 ++++++------ sled-agent/src/profile.rs | 2 +- sled-agent/src/rack_setup/config.rs | 4 +- 
sled-agent/src/rack_setup/plan/service.rs | 9 +- sled-agent/src/rack_setup/plan/sled.rs | 9 +- sled-agent/src/rack_setup/service.rs | 6 +- sled-agent/src/services.rs | 36 ++++---- sled-agent/src/sled_agent.rs | 7 +- sled-agent/src/sp/mod.rs | 4 +- sled-agent/src/storage_manager.rs | 14 ++-- sled-agent/src/updates.rs | 60 +++++++------ .../tests/integration_tests/commands.rs | 10 ++- sled-hardware/Cargo.toml | 1 + sled-hardware/src/disk.rs | 84 ++++++++++--------- sled-hardware/src/illumos/mod.rs | 15 ++-- sled-hardware/src/illumos/partitions.rs | 65 +++++++------- 39 files changed, 281 insertions(+), 239 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bd07b4a50fa..6412374c020 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3126,6 +3126,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "camino", "cfg-if 1.0.0", "futures", "ipnetwork", @@ -4027,6 +4028,8 @@ version = "0.1.0" dependencies = [ "anyhow", "bytes", + "camino", + "camino-tempfile", "chrono", "crucible-agent-client", "dns-server", @@ -4439,6 +4442,7 @@ dependencies = [ "authz-macros", "base64 0.21.0", "bb8", + "camino", "chrono", "clap 4.2.5", "cookie", @@ -4580,6 +4584,8 @@ dependencies = [ "bincode", "bootstrap-agent-client", "bytes", + "camino", + "camino-tempfile", "cfg-if 1.0.0", "chrono", "clap 4.2.5", @@ -7129,6 +7135,7 @@ name = "sled-hardware" version = "0.1.0" dependencies = [ "anyhow", + "camino", "cfg-if 1.0.0", "futures", "illumos-devinfo", diff --git a/illumos-utils/Cargo.toml b/illumos-utils/Cargo.toml index c3c4270eba7..b3c619d01ca 100644 --- a/illumos-utils/Cargo.toml +++ b/illumos-utils/Cargo.toml @@ -8,6 +8,7 @@ license = "MPL-2.0" [dependencies] anyhow.workspace = true async-trait.workspace = true +camino.workspace = true cfg-if.workspace = true futures.workspace = true ipnetwork.workspace = true diff --git a/illumos-utils/src/fstyp.rs b/illumos-utils/src/fstyp.rs index db256b5c845..dbbe3442dcc 100644 --- a/illumos-utils/src/fstyp.rs +++ b/illumos-utils/src/fstyp.rs 
@@ -6,7 +6,7 @@ use crate::zpool::ZpoolName; use crate::{execute, PFEXEC}; -use std::path::Path; +use camino::Utf8Path; use std::str::FromStr; const FSTYP: &str = "/usr/sbin/fstyp"; @@ -33,7 +33,7 @@ pub struct Fstyp {} impl Fstyp { /// Executes the 'fstyp' command and parses the name of a zpool from it, if /// one exists. - pub fn get_zpool(path: &Path) -> Result { + pub fn get_zpool(path: &Utf8Path) -> Result { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear(); cmd.env("LC_ALL", "C.UTF-8"); diff --git a/illumos-utils/src/opte/illumos.rs b/illumos-utils/src/opte/illumos.rs index 0dd262348f7..88e8d343b14 100644 --- a/illumos-utils/src/opte/illumos.rs +++ b/illumos-utils/src/opte/illumos.rs @@ -6,11 +6,11 @@ use crate::addrobj::AddrObject; use crate::dladm; +use camino::Utf8Path; use omicron_common::api::internal::shared::NetworkInterfaceKind; use opte_ioctl::OpteHdl; use slog::info; use slog::Logger; -use std::path::Path; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -72,7 +72,7 @@ pub fn initialize_xde_driver( underlay_nics: &[AddrObject], ) -> Result<(), Error> { const XDE_CONF: &str = "/kernel/drv/xde.conf"; - let xde_conf = Path::new(XDE_CONF); + let xde_conf = Utf8Path::new(XDE_CONF); if !xde_conf.exists() { return Err(Error::NoXdeConf); } diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index f26aa82721b..31c7d0a9729 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -10,6 +10,7 @@ use crate::link::{Link, VnicAllocator}; use crate::opte::{Port, PortTicket}; use crate::svc::wait_for_service; use crate::zone::{AddressRequest, ZONE_PREFIX}; +use camino::{Utf8Path, Utf8PathBuf}; use ipnetwork::IpNetwork; use omicron_common::backoff; use slog::info; @@ -17,7 +18,6 @@ use slog::o; use slog::warn; use slog::Logger; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; -use std::path::{Path, PathBuf}; #[cfg(any(test, feature = "testing"))] use crate::zone::MockZones as Zones; @@ 
-90,6 +90,9 @@ pub enum GetZoneError { err: crate::zone::AdmError, }, + #[error("Invalid Utf8 path: {0}")] + FromPathBuf(#[from] camino::FromPathBufError), + #[error("Zone with prefix '{prefix}' not found")] NotFound { prefix: String }, @@ -144,7 +147,7 @@ impl RunningZone { /// Returns the filesystem path to the zone's root pub fn root(&self) -> String { - format!("{}/root", self.inner.zonepath.display()) + format!("{}/root", self.inner.zonepath) } /// Runs a command within the Zone, return the output. @@ -476,7 +479,7 @@ impl RunningZone { running: true, inner: InstalledZone { log: log.new(o!("zone" => zone_name.to_string())), - zonepath: zone_info.path().into(), + zonepath: zone_info.path().to_path_buf().try_into()?, name: zone_name.to_string(), control_vnic, // TODO(https://github.com/oxidecomputer/omicron/issues/725) @@ -553,7 +556,7 @@ pub enum InstallZoneError { #[error("Failed to install zone '{zone}' from '{image_path}': {err}")] InstallZone { zone: String, - image_path: PathBuf, + image_path: Utf8PathBuf, #[source] err: crate::zone::AdmError, }, @@ -563,7 +566,7 @@ pub struct InstalledZone { log: Logger, // Filesystem path of the zone - zonepath: PathBuf, + zonepath: Utf8PathBuf, // Name of the Zone. 
name: String, @@ -609,7 +612,7 @@ impl InstalledZone { } /// Returns the filesystem path to the zonepath - pub fn zonepath(&self) -> &Path { + pub fn zonepath(&self) -> &Utf8Path { &self.zonepath } @@ -617,7 +620,7 @@ impl InstalledZone { pub async fn install( log: &Logger, underlay_vnic_allocator: &VnicAllocator, - zone_root_path: &Path, + zone_root_path: &Utf8Path, zone_type: &str, unique_name: Option<&str>, datasets: &[zone::Dataset], @@ -638,7 +641,7 @@ impl InstalledZone { let full_zone_name = Self::get_zone_name(zone_type, unique_name); let zone_image_path = - PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_type)); + Utf8PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_type)); let net_device_names: Vec = opte_ports .iter() diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index 19961b43d1a..aaf9865232d 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -5,8 +5,8 @@ //! Utilities for poking at ZFS. use crate::{execute, PFEXEC}; +use camino::Utf8PathBuf; use std::fmt; -use std::path::PathBuf; pub const ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT: &str = "/zone"; pub const ZONE_ZFS_RAMDISK_DATASET: &str = "rpool/zone"; @@ -92,14 +92,14 @@ pub struct Zfs {} pub enum Mountpoint { #[allow(dead_code)] Legacy, - Path(PathBuf), + Path(Utf8PathBuf), } impl fmt::Display for Mountpoint { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Mountpoint::Legacy => write!(f, "legacy"), - Mountpoint::Path(p) => write!(f, "{}", p.display()), + Mountpoint::Path(p) => write!(f, "{p}"), } } } diff --git a/illumos-utils/src/zone.rs b/illumos-utils/src/zone.rs index 2c431c0fa84..426abed6fff 100644 --- a/illumos-utils/src/zone.rs +++ b/illumos-utils/src/zone.rs @@ -5,12 +5,12 @@ //! API for interacting with Zones running Propolis. 
use anyhow::anyhow; +use camino::Utf8Path; use ipnetwork::IpNetwork; use ipnetwork::IpNetworkError; use slog::info; use slog::Logger; use std::net::{IpAddr, Ipv6Addr}; -use std::path::Path; use crate::addrobj::AddrObject; use crate::dladm::{EtherstubVnic, VNIC_PREFIX_BOOTSTRAP, VNIC_PREFIX_CONTROL}; @@ -281,9 +281,9 @@ impl Zones { #[allow(clippy::too_many_arguments)] pub async fn install_omicron_zone( log: &Logger, - zone_root_path: &Path, + zone_root_path: &Utf8Path, zone_name: &str, - zone_image: &std::path::Path, + zone_image: &Utf8Path, datasets: &[zone::Dataset], filesystems: &[zone::Fs], devices: &[zone::Device], diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index dc4507f0ea8..1528c44fdbf 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -5,10 +5,10 @@ //! Utilities for managing Zpools. use crate::{execute, ExecutionError, PFEXEC}; +use camino::{Utf8Path, Utf8PathBuf}; use schemars::JsonSchema; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::fmt; -use std::path::{Path, PathBuf}; use std::str::FromStr; use uuid::Uuid; @@ -167,7 +167,7 @@ pub struct Zpool {} #[cfg_attr(any(test, feature = "testing"), mockall::automock, allow(dead_code))] impl Zpool { - pub fn create(name: ZpoolName, vdev: &Path) -> Result<(), CreateError> { + pub fn create(name: ZpoolName, vdev: &Utf8Path) -> Result<(), CreateError> { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear(); cmd.env("LC_ALL", "C.UTF-8"); @@ -287,8 +287,8 @@ impl ZpoolName { /// Returns a path to a dataset's mountpoint within the zpool. 
/// /// For example: oxp_(UUID) -> /pool/ext/(UUID)/(dataset) - pub fn dataset_mountpoint(&self, dataset: &str) -> PathBuf { - let mut path = PathBuf::new(); + pub fn dataset_mountpoint(&self, dataset: &str) -> Utf8PathBuf { + let mut path = Utf8PathBuf::new(); path.push("/pool"); match self.kind { ZpoolKind::External => path.push("ext"), diff --git a/installinator/src/hardware.rs b/installinator/src/hardware.rs index c3dabd62ab9..91c3ea4ab28 100644 --- a/installinator/src/hardware.rs +++ b/installinator/src/hardware.rs @@ -45,7 +45,7 @@ impl Hardware { DiskVariant::U2 => { info!( log, "ignoring U.2 disk"; - "path" => disk.devfs_path().display(), + "path" => disk.devfs_path().as_str(), ); return None; } diff --git a/installinator/src/write.rs b/installinator/src/write.rs index f477ae16848..f223e37a8be 100644 --- a/installinator/src/write.rs +++ b/installinator/src/write.rs @@ -96,9 +96,9 @@ impl WriteDestination { info!( log, "found target M.2 disk"; "identity" => ?disk.identity(), - "path" => disk.devfs_path().display(), + "path" => disk.devfs_path().as_str(), "slot" => disk.slot(), - "boot_image_path" => path.display(), + "boot_image_path" => path.as_str(), "zpool" => %disk.zpool_name(), ); @@ -106,8 +106,7 @@ impl WriteDestination { Entry::Vacant(entry) => { entry.insert(ArtifactDestination { create_host_phase_2: false, - host_phase_2: Utf8PathBuf::try_from(path) - .context("non-UTF8 drive path")?, + host_phase_2: path, // TODO-completeness Fix this once we know how // to write the control plane image to this // disk's zpool. 
@@ -118,9 +117,9 @@ impl WriteDestination { warn!( log, "skipping duplicate M.2 drive entry"; "identity" => ?disk.identity(), - "path" => disk.devfs_path().display(), + "path" => disk.devfs_path().as_str(), "slot" => disk.slot(), - "boot_image_path" => path.display(), + "boot_image_path" => path.as_str(), "zpool" => %disk.zpool_name(), ); continue; @@ -131,7 +130,7 @@ impl WriteDestination { warn!( log, "found M.2 disk but failed to find boot image path"; "identity" => ?disk.identity(), - "path" => disk.devfs_path().display(), + "path" => disk.devfs_path().as_str(), "slot" => disk.slot(), "boot_image_path_err" => %err, "zpool" => %disk.zpool_name(), diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index fd039798a03..b60733bae3e 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -13,6 +13,7 @@ async-bb8-diesel.workspace = true async-trait.workspace = true base64.workspace = true bb8.workspace = true +camino.workspace = true clap.workspace = true chrono.workspace = true cookie.workspace = true diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index b6df0554546..7d564ebe321 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -475,6 +475,7 @@ mod tests { app::{saga::create_saga_dag, sagas::instance_create}, Nexus, TestInterfaces as _, }; + use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use http::{method::Method, StatusCode}; @@ -518,7 +519,7 @@ mod tests { cptestctx.server.get_http_server_internal_address().await; info!(&cptestctx.logctx.log, "Adding simulated sled"; "sled_id" => %sa_id); - let update_dir = std::path::Path::new("/should/be/unused"); + let update_dir = Utf8Path::new("/should/be/unused"); let sa = start_sled_agent( log, addr, diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index 72059c6ba17..b1702722b5e 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -7,6 +7,8 @@ license = "MPL-2.0" 
[dependencies] anyhow.workspace = true bytes.workspace = true +camino.workspace = true +camino-tempfile.workspace = true chrono.workspace = true crucible-agent-client.workspace = true dns-server.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 590d10e5049..503305618aa 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -5,6 +5,7 @@ //! Integration testing facilities for Nexus use anyhow::Context; +use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use dropshot::test_util::LogContext; use dropshot::ConfigDropshot; @@ -24,7 +25,6 @@ use slog::o; use slog::Logger; use std::fmt::Debug; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; -use std::path::Path; use std::time::Duration; use trust_dns_resolver::config::NameServerConfig; use trust_dns_resolver::config::Protocol; @@ -54,7 +54,7 @@ pub struct ControlPlaneTestContext { pub database: dev::db::CockroachInstance, pub clickhouse: dev::clickhouse::ClickHouseInstance, pub logctx: LogContext, - pub sled_agent_storage: tempfile::TempDir, + pub sled_agent_storage: camino_tempfile::Utf8TempDir, pub sled_agent: sim::Server, pub oximeter: Oximeter, pub producer: ProducerServer, @@ -125,7 +125,7 @@ pub fn load_test_config() -> omicron_common::nexus_config::Config { // change the logging level and local IP if they want, and as we add more // configuration options, we expect many of those can be usefully configured // (and reconfigured) for the test suite. - let config_file_path = Path::new("tests/config.test.toml"); + let config_file_path = Utf8Path::new("tests/config.test.toml"); let mut config = omicron_common::nexus_config::Config::from_file(config_file_path) .expect("failed to load config.test.toml"); @@ -182,7 +182,7 @@ pub async fn test_setup_with_config( // Set up a single sled agent. 
let sa_id = Uuid::parse_str(SLED_AGENT_UUID).unwrap(); - let tempdir = tempfile::tempdir().unwrap(); + let tempdir = camino_tempfile::tempdir().unwrap(); let sled_agent = start_sled_agent( logctx.log.new(o!( "component" => "omicron_sled_agent::sim::Server", @@ -357,7 +357,7 @@ pub async fn start_sled_agent( log: Logger, nexus_address: SocketAddr, id: Uuid, - update_directory: &Path, + update_directory: &Utf8Path, sim_mode: sim::SimMode, ) -> Result { let config = sim::Config { @@ -522,7 +522,7 @@ pub fn assert_same_items(v1: Vec, v2: Vec) { pub async fn start_dns_server( log: slog::Logger, - storage_path: &Path, + storage_path: &Utf8Path, ) -> Result< ( dns_server::dns_server::ServerHandle, @@ -533,7 +533,7 @@ pub async fn start_dns_server( > { let config_store = dns_server::storage::Config { keep_old_generations: 3, - storage_path: storage_path.to_string_lossy().into_owned().into(), + storage_path: storage_path.into(), }; let store = dns_server::storage::Store::new( log.new(o!("component" => "DnsStore")), diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index d5b3e17d345..cea2f7e533a 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -6,6 +6,7 @@ use super::metrics::query_for_latest_metric; +use camino::Utf8Path; use chrono::Utc; use http::method::Method; use http::StatusCode; @@ -3055,7 +3056,7 @@ async fn test_instance_v2p_mappings(cptestctx: &ControlPlaneTestContext) { let log = cptestctx.logctx.log.new(o!( "sled_id" => sa_id.to_string() )); let addr = cptestctx.server.get_http_server_internal_address().await; - let update_directory = std::path::Path::new("/should/not/be/used"); + let update_directory = Utf8Path::new("/should/not/be/used"); additional_sleds.push( start_sled_agent( log, diff --git a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index 89e74cbd50d..eb9048b3fde 100644 --- 
a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -4,6 +4,7 @@ //! Tests for APIs against sled-based endpoints. +use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::create_physical_disk; @@ -50,7 +51,7 @@ async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { let log = cptestctx.logctx.log.new(o!( "sled_id" => sa_id.to_string() )); let addr = cptestctx.server.get_http_server_internal_address().await; - let update_directory = std::path::Path::new("/should/not/be/used"); + let update_directory = Utf8Path::new("/should/not/be/used"); sas.push( start_sled_agent( log, diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index e478dade602..7d1d6917c2d 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -12,6 +12,8 @@ base64.workspace = true bincode.workspace = true bootstrap-agent-client.workspace = true bytes.workspace = true +camino.workspace = true +camino-tempfile.workspace = true cfg-if.workspace = true chrono.workspace = true clap.workspace = true diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index 006f297ab57..5a07c46d2e4 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -74,8 +74,8 @@ async fn main() { async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); - let tmp = - tempfile::tempdir().map_err(|e| CmdError::Failure(e.to_string()))?; + let tmp = camino_tempfile::tempdir() + .map_err(|e| CmdError::Failure(e.to_string()))?; let config = Config { id: args.uuid, sim_mode: args.sim_mode, diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index 6ede353b962..b5be303d367 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -4,6 +4,7 @@ //! 
Executable program to run the sled agent +use camino::Utf8PathBuf; use clap::{Parser, Subcommand}; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; @@ -13,7 +14,6 @@ use omicron_sled_agent::bootstrap::{ use omicron_sled_agent::rack_setup::config::SetupServiceConfig as RssConfig; use omicron_sled_agent::sp::SimSpConfig; use omicron_sled_agent::{config::Config as SledConfig, server as sled_server}; -use std::path::PathBuf; use uuid::Uuid; #[derive(Subcommand, Debug)] @@ -38,7 +38,7 @@ enum Args { /// Runs the Sled Agent server. Run { #[clap(name = "CONFIG_FILE_PATH", action)] - config_path: PathBuf, + config_path: Utf8PathBuf, }, } diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 350abc4ef5e..4682efd5f3f 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -24,6 +24,7 @@ use crate::services::ServiceManager; use crate::sp::SpHandle; use crate::storage_manager::StorageManager; use crate::updates::UpdateManager; +use camino::{Utf8Path, Utf8PathBuf}; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; use futures::stream::{self, StreamExt, TryStreamExt}; use illumos_utils::addrobj::AddrObject; @@ -46,7 +47,6 @@ use slog::Logger; use std::borrow::Cow; use std::collections::HashSet; use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; -use std::path::{Path, PathBuf}; use std::sync::Arc; use thiserror::Error; use tokio::sync::Mutex; @@ -77,7 +77,7 @@ pub enum BootstrapError { SledError(String), #[error("Error deserializing toml from {path}: {err}")] - Toml { path: PathBuf, err: toml::de::Error }, + Toml { path: Utf8PathBuf, err: toml::de::Error }, #[error(transparent)] TrustQuorum(#[from] TrustQuorumError), @@ -201,8 +201,8 @@ pub struct Agent { global_zone_bootstrap_link_local_address: Ipv6Addr, } -fn get_sled_agent_request_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH) +fn get_sled_agent_request_path() -> Utf8PathBuf { + 
Utf8Path::new(omicron_common::OMICRON_CONFIG_PATH) .join("sled-agent-request.toml") } @@ -342,7 +342,7 @@ impl Agent { let do_format = true; Zfs::ensure_filesystem( ZONE_ZFS_RAMDISK_DATASET, - Mountpoint::Path(std::path::PathBuf::from( + Mountpoint::Path(Utf8PathBuf::from( ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT, )), zoned, diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 1dbc1824fc5..ef1c7711436 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -185,13 +185,13 @@ mod tests { use super::*; use crate::bootstrap::trust_quorum::RackSecret; use crate::bootstrap::trust_quorum::ShareDistribution; - use std::path::PathBuf; + use camino::Utf8PathBuf; #[test] fn parse_rack_initialization() { let manifest = std::env::var("CARGO_MANIFEST_DIR") .expect("Cannot access manifest directory"); - let manifest = PathBuf::from(manifest); + let manifest = Utf8PathBuf::from(manifest); let path = manifest.join("../smf/sled-agent/non-gimlet/config-rss.toml"); diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index d2508580f9f..e5bb8aa1ba6 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -5,6 +5,7 @@ //! 
Interfaces for working with sled agent configuration use crate::updates::ConfigUpdates; +use camino::{Utf8Path, Utf8PathBuf}; use dropshot::ConfigLogging; use illumos_utils::dladm::Dladm; use illumos_utils::dladm::FindPhysicalLinkError; @@ -14,7 +15,6 @@ use illumos_utils::zpool::ZpoolName; use omicron_common::vlan::VlanID; use serde::Deserialize; use sled_hardware::is_gimlet; -use std::path::{Path, PathBuf}; #[derive(Clone, Debug, Deserialize)] #[serde(rename_all = "lowercase")] @@ -69,13 +69,13 @@ pub struct Config { pub enum ConfigError { #[error("Failed to read config from {path}: {err}")] Io { - path: PathBuf, + path: Utf8PathBuf, #[source] err: std::io::Error, }, #[error("Failed to parse config from {path}: {err}")] Parse { - path: PathBuf, + path: Utf8PathBuf, #[source] err: toml::de::Error, }, @@ -86,7 +86,7 @@ pub enum ConfigError { } impl Config { - pub fn from_file>(path: P) -> Result { + pub fn from_file>(path: P) -> Result { let path = path.as_ref(); let contents = std::fs::read_to_string(&path) .map_err(|err| ConfigError::Io { path: path.into(), err })?; @@ -124,7 +124,7 @@ mod test { fn test_smf_configs() { let manifest = std::env::var("CARGO_MANIFEST_DIR") .expect("Cannot access manifest directory"); - let smf = PathBuf::from(manifest).join("../smf/sled-agent"); + let smf = Utf8PathBuf::from(manifest).join("../smf/sled-agent"); let mut configs_seen = 0; for variant in std::fs::read_dir(smf).unwrap() { @@ -133,11 +133,11 @@ mod test { for entry in std::fs::read_dir(variant.path()).unwrap() { let entry = entry.unwrap(); if entry.file_name() == "config.toml" { - Config::from_file(entry.path()).unwrap_or_else(|_| { - panic!( - "Failed to parse config {}", - entry.path().display() - ) + let path = entry.path(); + let utf8_path: &Utf8Path = + path.as_path().try_into().unwrap(); + Config::from_file(&utf8_path).unwrap_or_else(|_| { + panic!("Failed to parse config {utf8_path}") }); configs_seen += 1; } diff --git a/sled-agent/src/instance.rs 
b/sled-agent/src/instance.rs index 0f0977f8fab..6c0deb2b716 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -767,7 +767,7 @@ impl Instance { // Create a zone for the propolis instance, using the previously // configured VNICs. let zname = propolis_zone_name(inner.propolis_id()); - let root = std::path::Path::new(ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT); + let root = camino::Utf8Path::new(ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT); let installed_zone = InstalledZone::install( &inner.log, &inner.vnic_allocator, diff --git a/sled-agent/src/ledger.rs b/sled-agent/src/ledger.rs index 5462630b553..2d4cfd15270 100644 --- a/sled-agent/src/ledger.rs +++ b/sled-agent/src/ledger.rs @@ -5,17 +5,17 @@ //! Utilities to help reading/writing toml files from/to multiple paths use async_trait::async_trait; +use camino::{Utf8Path, Utf8PathBuf}; use serde::{de::DeserializeOwned, Serialize}; use slog::Logger; -use std::path::{Path, PathBuf}; #[derive(thiserror::Error, Debug)] pub enum Error { #[error("Cannot serialize TOML to file {path}: {err}")] - TomlSerialize { path: PathBuf, err: toml::ser::Error }, + TomlSerialize { path: Utf8PathBuf, err: toml::ser::Error }, #[error("Cannot deserialize TOML from file {path}: {err}")] - TomlDeserialize { path: PathBuf, err: toml::de::Error }, + TomlDeserialize { path: Utf8PathBuf, err: toml::de::Error }, #[error("Failed to perform I/O: {message}: {err}")] Io { @@ -29,8 +29,8 @@ pub enum Error { } impl Error { - fn io_path(path: &Path, err: std::io::Error) -> Self { - Self::Io { message: format!("Error accessing {}", path.display()), err } + fn io_path(path: &Utf8Path, err: std::io::Error) -> Self { + Self::Io { message: format!("Error accessing {path}"), err } } } @@ -54,7 +54,7 @@ impl From for omicron_common::api::external::Error { pub struct Ledger { log: Logger, ledger: T, - paths: Vec, + paths: Vec, } impl Ledger { @@ -63,7 +63,10 @@ impl Ledger { /// Returns the following, in order: /// - The ledger with the highest 
generation number /// - If none exists, returns a default ledger - pub async fn new(log: &Logger, paths: Vec) -> Result { + pub async fn new( + log: &Logger, + paths: Vec, + ) -> Result { // Read all the ledgers that we can. let mut ledgers = vec![]; for path in paths.iter() { @@ -105,7 +108,7 @@ impl Ledger { let mut one_successful_write = false; for path in self.paths.iter() { if let Err(e) = self.atomic_write(&path).await { - warn!(self.log, "Failed to write to {}: {e}", path.display()); + warn!(self.log, "Failed to write to {path}: {e}"); } else { one_successful_write = true; } @@ -121,14 +124,11 @@ impl Ledger { // // We accomplish this by first writing to a temporary file, then // renaming to the target location. - async fn atomic_write(&self, path: &Path) -> Result<(), Error> { + async fn atomic_write(&self, path: &Utf8Path) -> Result<(), Error> { let mut tmp_path = path.to_path_buf(); let tmp_filename = format!( ".{}.tmp", - tmp_path - .file_name() - .expect("Should have file name") - .to_string_lossy() + tmp_path.file_name().expect("Should have file name") ); tmp_path.set_file_name(tmp_filename); @@ -153,9 +153,9 @@ pub trait Ledgerable: fn generation_bump(&mut self); /// Reads from `path` as a toml-serialized version of `Self`. - async fn read_from(log: &Logger, path: &Path) -> Result { + async fn read_from(log: &Logger, path: &Utf8Path) -> Result { if path.exists() { - debug!(log, "Reading ledger from {}", path.display()); + debug!(log, "Reading ledger from {path}"); toml::from_str( &tokio::fs::read_to_string(&path) .await @@ -166,14 +166,18 @@ pub trait Ledgerable: err, }) } else { - debug!(log, "No ledger in {}", path.display()); + debug!(log, "No ledger in {path}"); Ok(Self::default()) } } /// Writes to `path` as a toml-serialized version of `Self`. 
- async fn write_to(&self, log: &Logger, path: &Path) -> Result<(), Error> { - debug!(log, "Writing ledger to {}", path.display()); + async fn write_to( + &self, + log: &Logger, + path: &Utf8Path, + ) -> Result<(), Error> { + debug!(log, "Writing ledger to {path}"); let serialized = toml::Value::try_from(&self).expect("Cannot serialize ledger"); let as_str = toml::to_string(&serialized).map_err(|err| { @@ -212,7 +216,7 @@ mod test { let logctx = test_setup_log("create_default_ledger"); let log = &logctx.log; - let config_dir = tempfile::TempDir::new().unwrap(); + let config_dir = camino_tempfile::Utf8TempDir::new().unwrap(); let ledger = Ledger::::new(&log, vec![config_dir.path().to_path_buf()]) .await @@ -230,7 +234,7 @@ mod test { let logctx = test_setup_log("create_ledger_reads_from_storage"); let log = &logctx.log; - let config_dir = tempfile::TempDir::new().unwrap(); + let config_dir = camino_tempfile::Utf8TempDir::new().unwrap(); let config_path = config_dir.path().join("ledger.toml"); // Create the ledger within a configuration directory @@ -261,8 +265,8 @@ mod test { // Create the ledger, initialize contents. let config_dirs = vec![ - tempfile::TempDir::new().unwrap(), - tempfile::TempDir::new().unwrap(), + camino_tempfile::Utf8TempDir::new().unwrap(), + camino_tempfile::Utf8TempDir::new().unwrap(), ]; let config_paths = config_dirs .iter() @@ -307,8 +311,8 @@ mod test { // Create the ledger, initialize contents. 
let mut config_dirs = vec![ - tempfile::TempDir::new().unwrap(), - tempfile::TempDir::new().unwrap(), + camino_tempfile::Utf8TempDir::new().unwrap(), + camino_tempfile::Utf8TempDir::new().unwrap(), ]; let config_paths = config_dirs .iter() diff --git a/sled-agent/src/profile.rs b/sled-agent/src/profile.rs index 36d081c0b76..264072aa756 100644 --- a/sled-agent/src/profile.rs +++ b/sled-agent/src/profile.rs @@ -32,7 +32,7 @@ impl ProfileBuilder { let profile_path = format!( "{zonepath}/root/var/svc/profile/site.xml", - zonepath = installed_zone.zonepath().display() + zonepath = installed_zone.zonepath() ); tokio::fs::write(&profile_path, format!("{self}").as_bytes()).await?; diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index 9957d9257fd..7d2705be93e 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -5,15 +5,15 @@ //! Interfaces for working with RSS config. use crate::config::ConfigError; +use camino::Utf8Path; use omicron_common::address::{ get_64_subnet, Ipv6Subnet, AZ_PREFIX, RACK_PREFIX, SLED_PREFIX, }; -use std::path::Path; pub use crate::bootstrap::params::RackInitializeRequest as SetupServiceConfig; impl SetupServiceConfig { - pub fn from_file>(path: P) -> Result { + pub fn from_file>(path: P) -> Result { let path = path.as_ref(); let contents = std::fs::read_to_string(&path) .map_err(|err| ConfigError::Io { path: path.into(), err })?; diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index d79409adbc0..f9fe592ee52 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -10,6 +10,7 @@ use crate::params::{ ZoneType, }; use crate::rack_setup::config::SetupServiceConfig as Config; +use camino::{Utf8Path, Utf8PathBuf}; use dns_service_client::types::DnsConfigParams; use internal_dns::{ServiceName, DNS_ZONE}; use omicron_common::address::{ @@ -32,7 +33,6 @@ use slog::Logger; use 
std::collections::{HashMap, HashSet}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::num::Wrapping; -use std::path::{Path, PathBuf}; use thiserror::Error; use uuid::Uuid; @@ -61,8 +61,9 @@ const PANTRY_COUNT: usize = 1; // when Nexus provisions external DNS zones. const EXTERNAL_DNS_COUNT: usize = 1; -fn rss_service_plan_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH).join("rss-service-plan.toml") +fn rss_service_plan_path() -> Utf8PathBuf { + Utf8Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join("rss-service-plan.toml") } /// Describes errors which may occur while generating a plan for services. @@ -76,7 +77,7 @@ pub enum PlanError { }, #[error("Cannot deserialize TOML file at {path}: {err}")] - Toml { path: PathBuf, err: toml::de::Error }, + Toml { path: Utf8PathBuf, err: toml::de::Error }, #[error("Error making HTTP request to Sled Agent: {0}")] SledApi(#[from] SledAgentError), diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index f78a4c24845..80235701730 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -10,17 +10,18 @@ use crate::bootstrap::{ trust_quorum::{RackSecret, ShareDistribution}, }; use crate::rack_setup::config::SetupServiceConfig as Config; +use camino::{Utf8Path, Utf8PathBuf}; use serde::{Deserialize, Serialize}; use slog::Logger; use sprockets_host::Ed25519Certificate; use std::collections::{HashMap, HashSet}; use std::net::{Ipv6Addr, SocketAddrV6}; -use std::path::{Path, PathBuf}; use thiserror::Error; use uuid::Uuid; -fn rss_sled_plan_path() -> PathBuf { - Path::new(omicron_common::OMICRON_CONFIG_PATH).join("rss-sled-plan.toml") +fn rss_sled_plan_path() -> Utf8PathBuf { + Utf8Path::new(omicron_common::OMICRON_CONFIG_PATH) + .join("rss-sled-plan.toml") } pub fn generate_rack_secret<'a>( @@ -75,7 +76,7 @@ pub enum PlanError { }, #[error("Cannot deserialize TOML file at {path}: {err}")] - Toml { 
path: PathBuf, err: toml::de::Error }, + Toml { path: Utf8PathBuf, err: toml::de::Error }, #[error("Failed to split rack secret: {0:?}")] SplitRackSecret(vsss_rs::Error), diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 0f769d9e257..de5e8f99536 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -70,6 +70,7 @@ use crate::rack_setup::plan::service::{ use crate::rack_setup::plan::sled::{ generate_rack_secret, Plan as SledPlan, PlanError as SledPlanError, }; +use camino::{Utf8Path, Utf8PathBuf}; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; use internal_dns::resolver::{DnsError, Resolver as DnsResolver}; use internal_dns::ServiceName; @@ -93,7 +94,6 @@ use sprockets_host::Ed25519Certificate; use std::collections::{HashMap, HashSet}; use std::iter; use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; -use std::path::PathBuf; use thiserror::Error; /// Describes errors which may occur while operating the setup service. 
@@ -211,8 +211,8 @@ impl RackSetupService { } } -fn rss_completed_marker_path() -> PathBuf { - std::path::Path::new(omicron_common::OMICRON_CONFIG_PATH) +fn rss_completed_marker_path() -> Utf8PathBuf { + Utf8Path::new(omicron_common::OMICRON_CONFIG_PATH) .join("rss-plan-completed.marker") } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 6ef89fc9413..3a24da96ae6 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -34,6 +34,7 @@ use crate::profile::*; use crate::smf_helper::Service; use crate::smf_helper::SmfHelper; use crate::storage_manager::StorageManager; +use camino::{Utf8Path, Utf8PathBuf}; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; use dpd_client::{types as DpdTypes, Client as DpdClient, Error as DpdError}; use illumos_utils::addrobj::AddrObject; @@ -81,7 +82,6 @@ use std::collections::HashSet; use std::iter; use std::iter::FromIterator; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}; -use std::path::{Path, PathBuf}; use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; @@ -95,10 +95,10 @@ use uuid::Uuid; #[derive(thiserror::Error, Debug)] pub enum Error { #[error("Cannot serialize TOML to file {path}: {err}")] - TomlSerialize { path: PathBuf, err: toml::ser::Error }, + TomlSerialize { path: Utf8PathBuf, err: toml::ser::Error }, #[error("Cannot deserialize TOML from file {path}: {err}")] - TomlDeserialize { path: PathBuf, err: toml::de::Error }, + TomlDeserialize { path: Utf8PathBuf, err: toml::de::Error }, #[error("Failed to perform I/O: {message}: {err}")] Io { @@ -182,8 +182,8 @@ impl Error { fn io(message: &str, err: std::io::Error) -> Self { Self::Io { message: message.to_string(), err } } - fn io_path(path: &Path, err: std::io::Error) -> Self { - Self::Io { message: format!("Error accessing {}", path.display()), err } + fn io_path(path: &Utf8Path, err: std::io::Error) -> Self { + Self::Io { message: format!("Error accessing {path}"), err } } } 
@@ -251,7 +251,7 @@ impl Ledgerable for AllZoneRequests { struct ZoneRequest { zone: ServiceZoneRequest, // TODO: Consider collapsing "root" into ServiceZoneRequest - root: PathBuf, + root: Utf8PathBuf, } struct Task { @@ -321,7 +321,7 @@ pub struct ServiceManagerInner { // need this interface to provision Zone filesystems on explicit U.2s, // rather than simply placing them on the ramdisk. storage: StorageManager, - ledger_directory_override: OnceCell, + ledger_directory_override: OnceCell, } // Late-binding information, only known once the sled agent is up and @@ -403,7 +403,7 @@ impl ServiceManager { } #[cfg(test)] - async fn override_ledger_directory(&self, path: PathBuf) { + async fn override_ledger_directory(&self, path: Utf8PathBuf) { self.inner.ledger_directory_override.set(path).unwrap(); } @@ -411,7 +411,7 @@ impl ServiceManager { self.inner.switch_zone_bootstrap_address } - async fn all_service_ledgers(&self) -> Vec { + async fn all_service_ledgers(&self) -> Vec { if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(SERVICES_LEDGER_FILENAME)]; } @@ -425,7 +425,7 @@ impl ServiceManager { .collect() } - async fn all_storage_service_ledgers(&self) -> Vec { + async fn all_storage_service_ledgers(&self) -> Vec { if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(STORAGE_SERVICES_LEDGER_FILENAME)]; } @@ -614,7 +614,7 @@ impl ServiceManager { } for dev in &devices { - if !Path::new(dev).exists() { + if !Utf8Path::new(dev).exists() { return Err(Error::MissingDevice { device: dev.to_string() }); } } @@ -869,7 +869,7 @@ impl ServiceManager { let service = DnsClient {}; let smfh = SmfHelper::new(&running_zone, &service); - let etc = PathBuf::from(running_zone.root()).join("etc"); + let etc = Utf8PathBuf::from(running_zone.root()).join("etc"); let resolv_conf = etc.join("resolv.conf"); let nsswitch_conf = etc.join("nsswitch.conf"); let nsswitch_dns = etc.join("nsswitch.dns"); @@ -887,7 +887,7 @@ impl 
ServiceManager { config.push_str(&format!("nameserver {s}\n")); } - debug!(self.inner.log, "creating {}", resolv_conf.display()); + debug!(self.inner.log, "creating {resolv_conf}"); tokio::fs::write(&resolv_conf, config) .await .map_err(|err| Error::io_path(&resolv_conf, err))?; @@ -1333,7 +1333,7 @@ impl ServiceManager { }; // Copy the partial config file to the expected location. - let config_dir = PathBuf::from(format!( + let config_dir = Utf8PathBuf::from(format!( "{}/var/svc/manifest/site/nexus", running_zone.root() )); @@ -1829,7 +1829,7 @@ impl ServiceManager { let mut zone_requests = AllZoneRequests::default(); for zone in new_zone_requests.into_iter() { - let root = PathBuf::from(ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT); + let root = Utf8PathBuf::from(ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT); zone_requests.requests.push(ZoneRequest { zone, root }); } @@ -2285,7 +2285,7 @@ impl ServiceManager { let SledLocalZone::Initializing { request, filesystems, .. } = &*sled_zone else { return Ok(()) }; - let root = PathBuf::from(ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT); + let root = Utf8PathBuf::from(ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT); let request = ZoneRequest { zone: request.clone(), root }; let zone = self.initialize_zone(&request, filesystems).await?; *sled_zone = @@ -2466,12 +2466,12 @@ mod test { } struct TestConfig { - config_dir: tempfile::TempDir, + config_dir: camino_tempfile::Utf8TempDir, } impl TestConfig { async fn new() -> Self { - let config_dir = tempfile::TempDir::new().unwrap(); + let config_dir = camino_tempfile::Utf8TempDir::new().unwrap(); Self { config_dir } } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 8b4d5636fac..4c0a233b0f1 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -17,6 +17,7 @@ use crate::params::{ use crate::services::{self, ServiceManager}; use crate::storage_manager::{self, StorageManager}; use crate::updates::{ConfigUpdates, UpdateManager}; +use camino::Utf8PathBuf; use 
dropshot::HttpError; use illumos_utils::opte::params::SetVirtualNetworkInterfaceHost; use illumos_utils::opte::PortManager; @@ -34,7 +35,6 @@ use sled_hardware::underlay; use sled_hardware::HardwareManager; use slog::Logger; use std::net::{Ipv6Addr, SocketAddrV6}; -use std::path::PathBuf; use std::sync::Arc; use uuid::Uuid; @@ -265,8 +265,9 @@ impl SledAgent { let hardware = HardwareManager::new(&parent_log, services.sled_mode()) .map_err(|e| Error::Hardware(e))?; - let update_config = - ConfigUpdates { zone_artifact_path: PathBuf::from("/opt/oxide") }; + let update_config = ConfigUpdates { + zone_artifact_path: Utf8PathBuf::from("/opt/oxide"), + }; let updates = UpdateManager::new(update_config); let svc_config = services::Config::new( diff --git a/sled-agent/src/sp/mod.rs b/sled-agent/src/sp/mod.rs index 848ac7565ff..c8e86e3237e 100644 --- a/sled-agent/src/sp/mod.rs +++ b/sled-agent/src/sp/mod.rs @@ -5,6 +5,7 @@ //! Interface to a (simulated or real) SP / RoT. use crate::config::ConfigError; +use camino::Utf8Path; use illumos_utils::dladm::CreateVnicError; use illumos_utils::zone::EnsureGzAddressError; use serde::Deserialize; @@ -17,7 +18,6 @@ use sprockets_host::Ed25519PublicKey; use sprockets_host::RotManagerHandle; use sprockets_host::Session; use std::net::Ipv6Addr; -use std::path::Path; use std::time::Duration; use thiserror::Error; use tokio::io::AsyncRead; @@ -35,7 +35,7 @@ pub struct SimSpConfig { } impl SimSpConfig { - pub fn from_file>(path: P) -> Result { + pub fn from_file>(path: P) -> Result { let path = path.as_ref(); let contents = std::fs::read_to_string(&path) .map_err(|err| ConfigError::Io { path: path.into(), err })?; diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 0c58405e6d0..73ddd75cdd4 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -7,6 +7,7 @@ use crate::nexus::LazyNexusClient; use crate::params::DatasetKind; use crate::storage::dataset::DatasetName; +use 
camino::Utf8PathBuf; use futures::stream::FuturesOrdered; use futures::FutureExt; use futures::StreamExt; @@ -23,7 +24,6 @@ use slog::Logger; use std::collections::hash_map; use std::collections::HashMap; use std::convert::TryFrom; -use std::path::PathBuf; use std::pin::Pin; use std::sync::Arc; use tokio::sync::{mpsc, oneshot, Mutex}; @@ -76,7 +76,7 @@ pub enum Error { #[error("Failed to parse UUID from {path}: {err}")] ParseUuid { - path: PathBuf, + path: Utf8PathBuf, #[source] err: uuid::Error, }, @@ -103,14 +103,14 @@ pub enum Error { #[error("Failed to serialize toml (intended for {path:?}): {err}")] Serialize { - path: PathBuf, + path: Utf8PathBuf, #[source] err: toml::ser::Error, }, #[error("Failed to deserialize toml from {path:?}: {err}")] Deserialize { - path: PathBuf, + path: Utf8PathBuf, #[source] err: toml::de::Error, }, @@ -166,7 +166,7 @@ struct UnderlayRequest { #[derive(PartialEq, Eq, Clone)] enum DiskWrapper { - Real { disk: Disk, devfs_path: PathBuf }, + Real { disk: Disk, devfs_path: Utf8PathBuf }, Synthetic { zpool_name: ZpoolName }, } @@ -226,7 +226,7 @@ impl StorageResources { } /// Returns all mountpoints within all M.2s for a particular dataset. - pub async fn all_m2_mountpoints(&self, dataset: &str) -> Vec { + pub async fn all_m2_mountpoints(&self, dataset: &str) -> Vec { let m2_zpools = self.all_m2_zpools().await; m2_zpools .iter() @@ -283,7 +283,7 @@ impl StorageWorker { let do_format = true; Zfs::ensure_filesystem( &dataset_name.full(), - Mountpoint::Path(PathBuf::from("/data")), + Mountpoint::Path(Utf8PathBuf::from("/data")), zoned, do_format, )?; diff --git a/sled-agent/src/updates.rs b/sled-agent/src/updates.rs index 75819d30890..43eeac17fcd 100644 --- a/sled-agent/src/updates.rs +++ b/sled-agent/src/updates.rs @@ -5,6 +5,8 @@ //! 
Management of per-sled updates

 use crate::nexus::NexusClient;
+use camino::{Utf8Path, Utf8PathBuf};
+use camino_tempfile::NamedUtf8TempFile;
 use futures::{TryFutureExt, TryStreamExt};
 use omicron_common::api::external::SemverVersion;
 use omicron_common::api::internal::nexus::{
@@ -13,8 +15,6 @@ use omicron_common::api::internal::nexus::{
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::io::Read;
-use std::path::{Path, PathBuf};
-use tempfile::NamedTempFile;
 use tokio::io::AsyncWriteExt;

 #[derive(thiserror::Error, Debug)]
@@ -26,37 +26,40 @@ pub enum Error {
         err: std::io::Error,
     },

+    #[error("Utf-8 error converting path: {0}")]
+    FromPathBuf(#[from] camino::FromPathBufError),
+
     #[error(
         "sled-agent only supports applying zones, found artifact ID {}/{} with kind {}",
         .0.name, .0.version, .0.kind
     )]
     UnsupportedKind(UpdateArtifactId),

-    #[error("Version not found in artifact {}", .0.display())]
-    VersionNotFound(PathBuf),
+    #[error("Version not found in artifact {0}")]
+    VersionNotFound(Utf8PathBuf),

     #[error("Cannot parse json: {0}")]
     Json(#[from] serde_json::Error),

-    #[error("Malformed version in artifact {path}: {why}", path = path.display())]
-    VersionMalformed { path: PathBuf, why: String },
+    #[error("Malformed version in artifact {path}: {why}")]
+    VersionMalformed { path: Utf8PathBuf, why: String },

-    #[error("Cannot parse semver in {path}: {err}", path = path.display())]
-    Semver { path: PathBuf, err: semver::Error },
+    #[error("Cannot parse semver in {path}: {err}")]
+    Semver { path: Utf8PathBuf, err: semver::Error },

     #[error("Failed request to Nexus: {0}")]
     Response(nexus_client::Error),
 }

-fn default_zone_artifact_path() -> PathBuf {
-    PathBuf::from("/opt/oxide")
+fn default_zone_artifact_path() -> Utf8PathBuf {
+    Utf8PathBuf::from("/opt/oxide")
 }

 #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
 pub struct ConfigUpdates {
     // Path where zone artifacts are stored.
#[serde(default = "default_zone_artifact_path")] - pub zone_artifact_path: PathBuf, + pub zone_artifact_path: Utf8PathBuf, } impl Default for ConfigUpdates { @@ -72,15 +75,15 @@ pub struct Component { } // Helper functions for returning errors -fn version_malformed_err(path: &Path, key: &str) -> Error { +fn version_malformed_err(path: &Utf8Path, key: &str) -> Error { Error::VersionMalformed { path: path.to_path_buf(), why: format!("Missing '{key}'"), } } -fn io_err(path: &Path, err: std::io::Error) -> Error { - Error::Io { message: format!("Cannot access {}", path.display()), err } +fn io_err(path: &Utf8Path, err: std::io::Error) -> Error { + Error::Io { message: format!("Cannot access {path}"), err } } pub struct UpdateManager { @@ -111,7 +114,7 @@ impl UpdateManager { // We download the file to a temporary file. We then rename it to // "" after it has successfully downloaded, to // signify that it is ready for usage. - let (file, temp_path) = NamedTempFile::new_in(&directory) + let (file, temp_path) = NamedUtf8TempFile::new_in(&directory) .map_err(|err| Error::Io { message: "create temp file".to_string(), err, @@ -169,7 +172,7 @@ impl UpdateManager { // Gets the component version information from a single zone artifact. 
async fn component_get_zone_version( &self, - path: &Path, + path: &Utf8Path, ) -> Result { // Decode the zone image let file = @@ -183,7 +186,7 @@ impl UpdateManager { for entry in entries { let mut entry = entry.map_err(|err| io_err(path, err))?; let entry_path = entry.path().map_err(|err| io_err(path, err))?; - if entry_path == Path::new("oxide.json") { + if entry_path == Utf8Path::new("oxide.json") { let mut contents = String::new(); entry .read_to_string(&mut contents) @@ -220,6 +223,7 @@ impl UpdateManager { let entry = entry.map_err(|err| io_err(dir, err))?; let file_type = entry.file_type().map_err(|err| io_err(dir, err))?; + let path: Utf8PathBuf = entry.path().try_into()?; if file_type.is_file() && entry.file_name().to_string_lossy().ends_with(".tar.gz") @@ -228,14 +232,12 @@ impl UpdateManager { // // This logic may be tweaked in the future, depending on how we // bundle together zones. - components.push( - self.component_get_zone_version(&entry.path()).await?, - ); + components.push(self.component_get_zone_version(&path).await?); } else if file_type.is_dir() && entry.file_name().to_string_lossy() == "sled-agent" { // Sled Agent is the only non-zone file recognized as a component. - let version_path = entry.path().join("VERSION"); + let version_path = path.join("VERSION"); let version = tokio::fs::read_to_string(&version_path) .await .map_err(|err| io_err(&version_path, err))?; @@ -280,7 +282,8 @@ mod test { kind: KnownArtifactKind::ControlPlane, }; - let tempdir = tempfile::tempdir().expect("Failed to make tempdir"); + let tempdir = + camino_tempfile::tempdir().expect("Failed to make tempdir"); let expected_path = tempdir.path().join(expected_name); // Remove the file if it already exists. 
@@ -320,7 +323,8 @@ mod test { #[tokio::test] async fn test_query_no_components() { - let tempdir = tempfile::tempdir().expect("Failed to make tempdir"); + let tempdir = + camino_tempfile::tempdir().expect("Failed to make tempdir"); let config = ConfigUpdates { zone_artifact_path: tempdir.path().to_path_buf() }; let um = UpdateManager::new(config); @@ -331,7 +335,8 @@ mod test { #[tokio::test] async fn test_query_zone_version() { - let tempdir = tempfile::tempdir().expect("Failed to make tempdir"); + let tempdir = + camino_tempfile::tempdir().expect("Failed to make tempdir"); // Construct something that looks like a zone image in the tempdir. let zone_path = tempdir.path().join("test-pkg.tar.gz"); @@ -340,14 +345,14 @@ mod test { let mut archive = Builder::new(gzw); archive.mode(tar::HeaderMode::Deterministic); - let mut json = tempfile::NamedTempFile::new().unwrap(); + let mut json = NamedUtf8TempFile::new().unwrap(); json.write_all( &r#"{"v":"1","t":"layer","pkg":"test-pkg","version":"2.0.0"}"# .as_bytes(), ) .unwrap(); archive.append_path_with_name(json.path(), "oxide.json").unwrap(); - let mut other_data = tempfile::NamedTempFile::new().unwrap(); + let mut other_data = NamedUtf8TempFile::new().unwrap(); other_data .write_all("lets throw in another file for good measure".as_bytes()) .unwrap(); @@ -373,7 +378,8 @@ mod test { #[tokio::test] async fn test_query_sled_agent_version() { - let tempdir = tempfile::tempdir().expect("Failed to make tempdir"); + let tempdir = + camino_tempfile::tempdir().expect("Failed to make tempdir"); // Construct something that looks like the sled agent. 
let sled_agent_dir = tempdir.path().join("sled-agent"); diff --git a/sled-agent/tests/integration_tests/commands.rs b/sled-agent/tests/integration_tests/commands.rs index 559ca84daf2..132c3d78e48 100644 --- a/sled-agent/tests/integration_tests/commands.rs +++ b/sled-agent/tests/integration_tests/commands.rs @@ -8,6 +8,7 @@ // TODO-coverage: test success cases of sled-agent +use camino::Utf8PathBuf; use expectorate::assert_contents; use omicron_test_utils::dev::test_cmds::assert_exit_code; use omicron_test_utils::dev::test_cmds::path_to_executable; @@ -15,14 +16,15 @@ use omicron_test_utils::dev::test_cmds::run_command; use omicron_test_utils::dev::test_cmds::EXIT_SUCCESS; use omicron_test_utils::dev::test_cmds::EXIT_USAGE; use openapiv3::OpenAPI; -use std::path::PathBuf; use subprocess::Exec; /// name of the "sled-agent-sim" executable const CMD_SLED_AGENT_SIM: &str = env!("CARGO_BIN_EXE_sled-agent-sim"); -fn path_to_sled_agent_sim() -> PathBuf { +fn path_to_sled_agent_sim() -> Utf8PathBuf { path_to_executable(CMD_SLED_AGENT_SIM) + .try_into() + .expect("Invalid Utf8 binary?") } #[test] @@ -42,8 +44,8 @@ fn test_sled_agent_sim_no_args() { /// name of the "sled-agent" executable const CMD_SLED_AGENT: &str = env!("CARGO_BIN_EXE_sled-agent"); -fn path_to_sled_agent() -> PathBuf { - path_to_executable(CMD_SLED_AGENT) +fn path_to_sled_agent() -> Utf8PathBuf { + path_to_executable(CMD_SLED_AGENT).try_into().expect("Invalid Utf8 binary?") } #[test] diff --git a/sled-hardware/Cargo.toml b/sled-hardware/Cargo.toml index 064d4cb8a7d..941ac74a072 100644 --- a/sled-hardware/Cargo.toml +++ b/sled-hardware/Cargo.toml @@ -7,6 +7,7 @@ license = "MPL-2.0" [dependencies] anyhow.workspace = true +camino.workspace = true cfg-if.workspace = true futures.workspace = true illumos-utils.workspace = true diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index e3098329930..aad78a1b174 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -2,6 +2,7 @@ // 
License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use camino::{Utf8Path, Utf8PathBuf}; use illumos_utils::fstyp::Fstyp; use illumos_utils::zfs::Mountpoint; use illumos_utils::zfs::Zfs; @@ -10,7 +11,6 @@ use illumos_utils::zpool::ZpoolKind; use illumos_utils::zpool::ZpoolName; use slog::Logger; use slog::{info, warn}; -use std::path::{Path, PathBuf}; use uuid::Uuid; cfg_if::cfg_if! { @@ -24,13 +24,13 @@ cfg_if::cfg_if! { #[derive(Debug, thiserror::Error)] pub enum DiskError { #[error("Cannot open {path} due to {error}")] - IoError { path: PathBuf, error: std::io::Error }, + IoError { path: Utf8PathBuf, error: std::io::Error }, #[error("Failed to open partition at {path} due to {error}")] - Gpt { path: PathBuf, error: anyhow::Error }, + Gpt { path: Utf8PathBuf, error: anyhow::Error }, #[error("Unexpected partition layout at {path}: {why}")] - BadPartitionLayout { path: PathBuf, why: String }, + BadPartitionLayout { path: Utf8PathBuf, why: String }, #[error("Requested partition {partition:?} not found on device {path}")] - NotFound { path: PathBuf, partition: Partition }, + NotFound { path: Utf8PathBuf, partition: Partition }, #[error(transparent)] EnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), #[error(transparent)] @@ -38,7 +38,7 @@ pub enum DiskError { #[error("Cannot import zpool: {0}")] ZpoolImport(illumos_utils::zpool::Error), #[error("Cannot format {path}: missing a '/dev' path")] - CannotFormatMissingDevPath { path: PathBuf }, + CannotFormatMissingDevPath { path: Utf8PathBuf }, #[error("Formatting M.2 devices is not yet implemented")] CannotFormatM2NotImplemented, } @@ -61,9 +61,9 @@ pub enum Partition { pub struct DiskPaths { // Full path to the disk under "/devices". // Should NOT end with a ":partition_letter". - pub devfs_path: PathBuf, + pub devfs_path: Utf8PathBuf, // Optional path to the disk under "/dev/dsk". 
- pub dev_path: Option, + pub dev_path: Option, } /// Uniquely identifies a disk. @@ -76,15 +76,15 @@ pub struct DiskIdentity { impl DiskPaths { // Returns the "illumos letter-indexed path" for a device. - fn partition_path(&self, index: usize, raw: bool) -> Option { + fn partition_path(&self, index: usize, raw: bool) -> Option { let index = u8::try_from(index).ok()?; - let path = self.devfs_path.display(); + let path = &self.devfs_path; let character = match index { 0..=5 => (b'a' + index) as char, _ => return None, }; - Some(PathBuf::from(format!( + Some(Utf8PathBuf::from(format!( "{path}:{character}{suffix}", suffix = if raw { ",raw" } else { "" } ))) @@ -92,10 +92,10 @@ impl DiskPaths { /// Returns the path to the whole disk #[allow(dead_code)] - pub(crate) fn whole_disk(&self, raw: bool) -> PathBuf { - PathBuf::from(format!( + pub(crate) fn whole_disk(&self, raw: bool) -> Utf8PathBuf { + let path = &self.devfs_path; + Utf8PathBuf::from(format!( "{path}:wd{raw}", - path = self.devfs_path.display(), raw = if raw { ",raw" } else { "" }, )) } @@ -106,7 +106,7 @@ impl DiskPaths { partitions: &[Partition], expected_partition: Partition, raw: bool, - ) -> Result { + ) -> Result { for (index, partition) in partitions.iter().enumerate() { if &expected_partition == partition { let path = @@ -143,8 +143,8 @@ pub struct UnparsedDisk { impl UnparsedDisk { #[allow(dead_code)] pub fn new( - devfs_path: PathBuf, - dev_path: Option, + devfs_path: Utf8PathBuf, + dev_path: Option, slot: i64, variant: DiskVariant, identity: DiskIdentity, @@ -157,7 +157,7 @@ impl UnparsedDisk { } } - pub fn devfs_path(&self) -> &PathBuf { + pub fn devfs_path(&self) -> &Utf8PathBuf { &self.paths.devfs_path } @@ -266,7 +266,7 @@ impl Disk { fn ensure_zpool_exists( log: &Logger, variant: DiskVariant, - zpool_path: &Path, + zpool_path: &Utf8Path, ) -> Result { let zpool_name = match Fstyp::get_zpool(&zpool_path) { Ok(zpool_name) => zpool_name, @@ -285,7 +285,7 @@ impl Disk { info!( log, "GPT exists 
without Zpool: formatting zpool at {}", - zpool_path.display(), + zpool_path, ); // If a zpool does not already exist, create one. let zpool_name = match variant { @@ -347,7 +347,7 @@ impl Disk { self.variant } - pub fn devfs_path(&self) -> &PathBuf { + pub fn devfs_path(&self) -> &Utf8PathBuf { &self.paths.devfs_path } @@ -358,7 +358,7 @@ impl Disk { pub fn boot_image_devfs_path( &self, raw: bool, - ) -> Result { + ) -> Result { self.paths.partition_device_path( &self.partitions, Partition::BootImage, @@ -394,65 +394,67 @@ mod test { #[test] fn test_disk_paths() { const DEVFS_PATH: &'static str = "/devices/my/disk"; - let paths = - DiskPaths { devfs_path: PathBuf::from(DEVFS_PATH), dev_path: None }; + let paths = DiskPaths { + devfs_path: Utf8PathBuf::from(DEVFS_PATH), + dev_path: None, + }; assert_eq!( paths.whole_disk(false), - PathBuf::from(format!("{DEVFS_PATH}:wd")) + Utf8PathBuf::from(format!("{DEVFS_PATH}:wd")) ); assert_eq!( paths.whole_disk(true), - PathBuf::from(format!("{DEVFS_PATH}:wd,raw")) + Utf8PathBuf::from(format!("{DEVFS_PATH}:wd,raw")) ); assert_eq!( paths.partition_path(0, false), - Some(PathBuf::from(format!("{DEVFS_PATH}:a"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:a"))) ); assert_eq!( paths.partition_path(1, false), - Some(PathBuf::from(format!("{DEVFS_PATH}:b"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:b"))) ); assert_eq!( paths.partition_path(2, false), - Some(PathBuf::from(format!("{DEVFS_PATH}:c"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:c"))) ); assert_eq!( paths.partition_path(3, false), - Some(PathBuf::from(format!("{DEVFS_PATH}:d"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:d"))) ); assert_eq!( paths.partition_path(4, false), - Some(PathBuf::from(format!("{DEVFS_PATH}:e"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:e"))) ); assert_eq!( paths.partition_path(5, false), - Some(PathBuf::from(format!("{DEVFS_PATH}:f"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:f"))) ); 
assert_eq!(paths.partition_path(6, false), None); assert_eq!( paths.partition_path(0, true), - Some(PathBuf::from(format!("{DEVFS_PATH}:a,raw"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:a,raw"))) ); assert_eq!( paths.partition_path(1, true), - Some(PathBuf::from(format!("{DEVFS_PATH}:b,raw"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:b,raw"))) ); assert_eq!( paths.partition_path(2, true), - Some(PathBuf::from(format!("{DEVFS_PATH}:c,raw"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:c,raw"))) ); assert_eq!( paths.partition_path(3, true), - Some(PathBuf::from(format!("{DEVFS_PATH}:d,raw"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:d,raw"))) ); assert_eq!( paths.partition_path(4, true), - Some(PathBuf::from(format!("{DEVFS_PATH}:e,raw"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:e,raw"))) ); assert_eq!( paths.partition_path(5, true), - Some(PathBuf::from(format!("{DEVFS_PATH}:f,raw"))) + Some(Utf8PathBuf::from(format!("{DEVFS_PATH}:f,raw"))) ); assert_eq!(paths.partition_path(6, true), None); } @@ -460,8 +462,10 @@ mod test { #[test] fn test_partition_device_paths() { const DEVFS_PATH: &'static str = "/devices/my/disk"; - let paths = - DiskPaths { devfs_path: PathBuf::from(DEVFS_PATH), dev_path: None }; + let paths = DiskPaths { + devfs_path: Utf8PathBuf::from(DEVFS_PATH), + dev_path: None, + }; assert_eq!( paths diff --git a/sled-hardware/src/illumos/mod.rs b/sled-hardware/src/illumos/mod.rs index 49abc33fa1a..6bd22fa56d4 100644 --- a/sled-hardware/src/illumos/mod.rs +++ b/sled-hardware/src/illumos/mod.rs @@ -6,6 +6,7 @@ use crate::{ Baseboard, DendriteAsic, DiskIdentity, DiskVariant, HardwareUpdate, SledMode, UnparsedDisk, }; +use camino::Utf8PathBuf; use illumos_devinfo::{DevInfo, DevLinkType, DevLinks, Node, Property}; use slog::debug; use slog::error; @@ -14,7 +15,6 @@ use slog::o; use slog::warn; use slog::Logger; use std::collections::{HashMap, HashSet}; -use std::path::PathBuf; use std::sync::Arc; use std::sync::Mutex; use 
tokio::sync::broadcast; @@ -34,6 +34,9 @@ enum Error { #[error("Device does not appear to be an Oxide Gimlet: {0}")] NotAGimlet(String), + #[error("Invalid Utf8 path: {0}")] + FromPathBuf(#[from] camino::FromPathBufError), + #[error("Node {node} missing device property {name}")] MissingDeviceProperty { node: String, name: String }, @@ -44,7 +47,7 @@ enum Error { UnexpectedPropertyType { name: String, ty: String }, #[error("Could not translate {0} to '/dev' path: no links")] - NoDevLinks(PathBuf), + NoDevLinks(Utf8PathBuf), #[error("Failed to issue request to sysconf: {0}")] SysconfError(#[from] sysconf::Error), @@ -262,7 +265,7 @@ fn get_tofino_snapshot(log: &Logger, devinfo: &mut DevInfo) -> TofinoSnapshot { fn get_dev_path_of_whole_disk( node: &Node<'_>, -) -> Result, Error> { +) -> Result, Error> { let mut wm = node.minors(); while let Some(m) = wm.next().transpose().map_err(Error::DevInfo)? { // "wd" stands for "whole disk" @@ -298,9 +301,9 @@ fn get_dev_path_of_whole_disk( .collect::>(); if paths.is_empty() { - return Err(Error::NoDevLinks(PathBuf::from(devfs_path))); + return Err(Error::NoDevLinks(Utf8PathBuf::from(devfs_path))); } - return Ok(Some(paths[0].path().to_path_buf())); + return Ok(Some(paths[0].path().to_path_buf().try_into()?)); } Ok(None) } @@ -443,7 +446,7 @@ fn poll_blkdev_node( }; let disk = UnparsedDisk::new( - PathBuf::from(&devfs_path), + Utf8PathBuf::from(&devfs_path), dev_path, slot, variant, diff --git a/sled-hardware/src/illumos/partitions.rs b/sled-hardware/src/illumos/partitions.rs index 1297cebe00c..950074bd3ad 100644 --- a/sled-hardware/src/illumos/partitions.rs +++ b/sled-hardware/src/illumos/partitions.rs @@ -6,10 +6,10 @@ use crate::illumos::gpt; use crate::{DiskError, DiskPaths, DiskVariant, Partition}; +use camino::Utf8Path; use illumos_utils::zpool::ZpoolName; use slog::info; use slog::Logger; -use std::path::Path; use uuid::Uuid; #[cfg(test)] @@ -38,7 +38,7 @@ static U2_EXPECTED_PARTITIONS: [Partition; 
U2_EXPECTED_PARTITION_COUNT] = [Partition::ZfsPool]; fn parse_partition_types( - path: &Path, + path: &Utf8Path, partitions: &Vec, expected_partitions: &[Partition; N], ) -> Result, DiskError> { @@ -100,21 +100,13 @@ fn internal_ensure_partition_layout( let gpt = match GPT::read(&path) { Ok(gpt) => { // This should be the common steady-state case - info!( - log, - "Disk at {} already has a GPT", - paths.devfs_path.display() - ); + info!(log, "Disk at {} already has a GPT", paths.devfs_path); gpt } Err(libefi_illumos::Error::LabelNotFound) => { // Fresh U.2 disks are an example of devices where "we don't expect // a GPT to exist". - info!( - log, - "Disk at {} does not have a GPT", - paths.devfs_path.display() - ); + info!(log, "Disk at {} does not have a GPT", paths.devfs_path); // For ZFS-implementation-specific reasons, Zpool create can only // act on devices under the "/dev" hierarchy, rather than the device @@ -126,11 +118,7 @@ fn internal_ensure_partition_layout( }; match variant { DiskVariant::U2 => { - info!( - log, - "Formatting zpool on disk {}", - paths.devfs_path.display() - ); + info!(log, "Formatting zpool on disk {}", paths.devfs_path); // If a zpool does not already exist, create one. 
let zpool_name = ZpoolName::new_external(Uuid::new_v4()); Zpool::create(zpool_name, dev_path)?; @@ -169,9 +157,10 @@ fn internal_ensure_partition_layout( mod test { use super::*; use crate::DiskPaths; + use camino::Utf8PathBuf; use illumos_utils::zpool::MockZpool; use omicron_test_utils::dev::test_setup_log; - use std::path::PathBuf; + use std::path::Path; struct FakePartition { index: usize, @@ -201,7 +190,7 @@ mod test { ); let log = &logctx.log; - let devfs_path = PathBuf::from("/devfs/path"); + let devfs_path = Utf8PathBuf::from("/devfs/path"); let result = internal_ensure_partition_layout::( &log, &DiskPaths { devfs_path, dev_path: None }, @@ -222,20 +211,23 @@ mod test { test_setup_log("ensure_partition_layout_u2_format_with_dev_path"); let log = &logctx.log; - let devfs_path = PathBuf::from("/devfs/path"); + let devfs_path = Utf8PathBuf::from("/devfs/path"); const DEV_PATH: &'static str = "/dev/path"; // We expect that formatting a zpool will involve calling // "Zpool::create" with the provided "dev_path". 
let create_ctx = MockZpool::create_context(); create_ctx.expect().return_once(|_, observed_dev_path| { - assert_eq!(&PathBuf::from(DEV_PATH), observed_dev_path); + assert_eq!(&Utf8PathBuf::from(DEV_PATH), observed_dev_path); Ok(()) }); let partitions = internal_ensure_partition_layout::( &log, - &DiskPaths { devfs_path, dev_path: Some(PathBuf::from(DEV_PATH)) }, + &DiskPaths { + devfs_path, + dev_path: Some(Utf8PathBuf::from(DEV_PATH)), + }, DiskVariant::U2, ) .expect("Should have succeeded partitioning disk"); @@ -251,12 +243,15 @@ mod test { let logctx = test_setup_log("ensure_partition_layout_m2_cannot_format"); let log = &logctx.log.clone(); - let devfs_path = PathBuf::from("/devfs/path"); + let devfs_path = Utf8PathBuf::from("/devfs/path"); const DEV_PATH: &'static str = "/dev/path"; assert!(internal_ensure_partition_layout::( &log, - &DiskPaths { devfs_path, dev_path: Some(PathBuf::from(DEV_PATH)) }, + &DiskPaths { + devfs_path, + dev_path: Some(Utf8PathBuf::from(DEV_PATH)) + }, DiskVariant::M2, ) .is_err()); @@ -285,12 +280,15 @@ mod test { test_setup_log("ensure_partition_layout_u2_with_expected_format"); let log = &logctx.log; - let devfs_path = PathBuf::from("/devfs/path"); + let devfs_path = Utf8PathBuf::from("/devfs/path"); const DEV_PATH: &'static str = "/dev/path"; let partitions = internal_ensure_partition_layout::( &log, - &DiskPaths { devfs_path, dev_path: Some(PathBuf::from(DEV_PATH)) }, + &DiskPaths { + devfs_path, + dev_path: Some(Utf8PathBuf::from(DEV_PATH)), + }, DiskVariant::U2, ) .expect("Should be able to parse disk"); @@ -324,12 +322,15 @@ mod test { test_setup_log("ensure_partition_layout_m2_with_expected_format"); let log = &logctx.log; - let devfs_path = PathBuf::from("/devfs/path"); + let devfs_path = Utf8PathBuf::from("/devfs/path"); const DEV_PATH: &'static str = "/dev/path"; let partitions = internal_ensure_partition_layout::( &log, - &DiskPaths { devfs_path, dev_path: Some(PathBuf::from(DEV_PATH)) }, + &DiskPaths { + devfs_path, + 
dev_path: Some(Utf8PathBuf::from(DEV_PATH)), + }, DiskVariant::M2, ) .expect("Should be able to parse disk"); @@ -359,7 +360,7 @@ mod test { test_setup_log("ensure_partition_layout_m2_fails_with_empty_gpt"); let log = &logctx.log; - let devfs_path = PathBuf::from("/devfs/path"); + let devfs_path = Utf8PathBuf::from("/devfs/path"); const DEV_PATH: &'static str = "/dev/path"; assert!(matches!( @@ -367,7 +368,7 @@ mod test { &log, &DiskPaths { devfs_path, - dev_path: Some(PathBuf::from(DEV_PATH)), + dev_path: Some(Utf8PathBuf::from(DEV_PATH)), }, DiskVariant::M2, ) @@ -384,7 +385,7 @@ mod test { test_setup_log("ensure_partition_layout_u2_fails_with_empty_gpt"); let log = &logctx.log; - let devfs_path = PathBuf::from("/devfs/path"); + let devfs_path = Utf8PathBuf::from("/devfs/path"); const DEV_PATH: &'static str = "/dev/path"; assert!(matches!( @@ -392,7 +393,7 @@ mod test { &log, &DiskPaths { devfs_path, - dev_path: Some(PathBuf::from(DEV_PATH)), + dev_path: Some(Utf8PathBuf::from(DEV_PATH)), }, DiskVariant::U2, ) From ef6756d10be82a0c9fea1df597989acccaee1260 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 May 2023 16:51:12 -0400 Subject: [PATCH 15/39] Fix indexing --- sled-agent/src/ledger.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sled-agent/src/ledger.rs b/sled-agent/src/ledger.rs index 5462630b553..197c7607e4b 100644 --- a/sled-agent/src/ledger.rs +++ b/sled-agent/src/ledger.rs @@ -281,7 +281,7 @@ mod test { drop(ledger); // Let's write again, but only using one of the two config dirs. 
- let mut ledger = Ledger::::new(&log, config_paths[..=1].to_vec()) + let mut ledger = Ledger::::new(&log, config_paths[..1].to_vec()) .await .expect("Failed to create ledger"); ledger.data_mut().contents = "even newer contents".to_string(); From c621c53a77babd233bd82f1b7f9e563ebe57dd20 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 May 2023 18:14:44 -0400 Subject: [PATCH 16/39] end-to-end tests too --- Cargo.lock | 1 + end-to-end-tests/Cargo.toml | 1 + end-to-end-tests/src/helpers/ctx.rs | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d1192ce4daf..a5aace1d17a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2138,6 +2138,7 @@ dependencies = [ "anyhow", "async-trait", "base64 0.21.0", + "camino", "omicron-sled-agent", "omicron-test-utils", "oxide-client", diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index dcef7071b1a..b727bf7213c 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -8,6 +8,7 @@ license = "MPL-2.0" anyhow = { workspace = true, features = ["backtrace"] } async-trait.workspace = true base64.workspace = true +camino.workspace = true omicron-sled-agent.workspace = true omicron-test-utils.workspace = true oxide-client.workspace = true diff --git a/end-to-end-tests/src/helpers/ctx.rs b/end-to-end-tests/src/helpers/ctx.rs index 062b8d3053c..99fd5e3ceeb 100644 --- a/end-to-end-tests/src/helpers/ctx.rs +++ b/end-to-end-tests/src/helpers/ctx.rs @@ -1,12 +1,12 @@ use crate::helpers::generate_name; use anyhow::{Context as _, Result}; +use camino::Utf8Path; use omicron_sled_agent::rack_setup::config::SetupServiceConfig; use oxide_client::types::{Name, ProjectCreate}; use oxide_client::{Client, ClientProjectsExt, ClientVpcsExt}; use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION}; use reqwest::Url; use std::net::SocketAddr; -use std::path::Path; use std::time::Duration; #[derive(Clone)] @@ -86,7 +86,7 @@ pub fn nexus_addr() -> SocketAddr { // If we 
can find config-rss.toml, grab the second address from the // configured services IP pool. - let rss_config_path = Path::new(env!("CARGO_MANIFEST_DIR")) + let rss_config_path = Utf8Path::new(env!("CARGO_MANIFEST_DIR")) .join("../smf/sled-agent/non-gimlet/config-rss.toml"); if rss_config_path.exists() { if let Ok(config) = SetupServiceConfig::from_file(rss_config_path) { From 02cb5396ed5e5f38562b239efea87c6d74e36268 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 May 2023 12:26:08 -0400 Subject: [PATCH 17/39] Use ledger in RSS --- common/src/lib.rs | 1 + sled-agent/src/bootstrap/agent.rs | 15 +++- sled-agent/src/bootstrap/hardware.rs | 10 ++- sled-agent/src/bootstrap/rss_handle.rs | 3 + sled-agent/src/ledger.rs | 99 ++++++++++++++--------- sled-agent/src/rack_setup/plan/service.rs | 82 +++++++++---------- sled-agent/src/rack_setup/plan/sled.rs | 79 +++++++++--------- sled-agent/src/rack_setup/service.rs | 88 ++++++++++++++------ sled-agent/src/services.rs | 52 +++++++----- 9 files changed, 258 insertions(+), 171 deletions(-) diff --git a/common/src/lib.rs b/common/src/lib.rs index db035c5eaa0..1d7c691e7af 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -56,4 +56,5 @@ macro_rules! generate_logging_api { /// /// NOTE: Be careful when modifying this path - the installation tools will /// **remove the entire directory** to re-install/uninstall the system. 
+// #[deprecated] pub const OMICRON_CONFIG_PATH: &'static str = "/var/oxide"; diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index b87043876b2..1a9c747e8ee 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -22,7 +22,7 @@ use crate::config::Config as SledConfig; use crate::server::Server as SledServer; use crate::services::ServiceManager; use crate::sp::SpHandle; -use crate::storage_manager::StorageManager; +use crate::storage_manager::{StorageManager, StorageResources}; use crate::updates::UpdateManager; use camino::{Utf8Path, Utf8PathBuf}; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; @@ -40,6 +40,7 @@ use omicron_common::api::external::Error as ExternalError; use omicron_common::backoff::{ retry_notify, retry_policy_internal_service_aggressive, BackoffError, }; +use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use sled_hardware::underlay::BootstrapInterface; use sled_hardware::HardwareManager; @@ -193,6 +194,7 @@ pub struct Agent { share: Mutex>, sled_state: Mutex, + storage_resources: OnceCell, config: Config, sled_config: SledConfig, sp: Option, @@ -381,6 +383,7 @@ impl Agent { rss_access: Mutex::new(()), share: Mutex::new(None), sled_state: Mutex::new(SledAgentState::Before(None)), + storage_resources: OnceCell::new(), config: config.clone(), sled_config, sp, @@ -389,6 +392,12 @@ impl Agent { }; let hardware_monitor = agent.start_hardware_monitor().await?; + // TODO... can I make this less shite? 
+ agent + .storage_resources + .set(hardware_monitor.storage().clone()) + .map_err(|_| "Failed to set storage") + .unwrap(); *agent.sled_state.lock().await = SledAgentState::Before(Some(hardware_monitor)); @@ -780,6 +789,10 @@ impl Agent { .as_ref() .map(|sp_config| sp_config.trust_quorum_members.clone()) .unwrap_or_default(), + self.storage_resources + .get() + .expect("Should set storage during initialization") + .clone(), ) .await?; Ok(()) diff --git a/sled-agent/src/bootstrap/hardware.rs b/sled-agent/src/bootstrap/hardware.rs index a99661aab4e..6c178642d2f 100644 --- a/sled-agent/src/bootstrap/hardware.rs +++ b/sled-agent/src/bootstrap/hardware.rs @@ -6,7 +6,7 @@ use crate::config::{Config as SledConfig, SledMode as SledModeConfig}; use crate::services::ServiceManager; -use crate::storage_manager::StorageManager; +use crate::storage_manager::{StorageManager, StorageResources}; use illumos_utils::dladm::{Etherstub, EtherstubVnic}; use sled_hardware::{DendriteAsic, HardwareManager, SledMode}; use slog::Logger; @@ -136,6 +136,7 @@ pub(crate) struct HardwareMonitor { handle: JoinHandle< Result<(HardwareManager, ServiceManager, StorageManager), Error>, >, + storage_resources: StorageResources, } impl HardwareMonitor { @@ -212,6 +213,7 @@ impl HardwareMonitor { storage: StorageManager, ) -> Self { let (exit_tx, exit_rx) = oneshot::channel(); + let storage_resources = storage.resources().clone(); let worker = HardwareMonitorWorker::new( log.clone(), exit_rx, @@ -221,7 +223,11 @@ impl HardwareMonitor { ); let handle = tokio::spawn(async move { worker.run().await }); - Self { exit_tx, handle } + Self { exit_tx, handle, storage_resources } + } + + pub fn storage(&self) -> &StorageResources { + &self.storage_resources } // Stops the task from executing diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index 1546bdf4d54..953425c5c5e 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ 
-11,6 +11,7 @@ use crate::rack_setup::config::SetupServiceConfig; use crate::rack_setup::service::RackSetupService; use crate::rack_setup::service::SetupServiceError; use crate::sp::SpHandle; +use crate::storage_manager::StorageResources; use ::bootstrap_agent_client::Client as BootstrapAgentClient; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -49,12 +50,14 @@ impl RssHandle { our_bootstrap_address: Ipv6Addr, sp: Option, member_device_id_certs: Vec, + storage_resources: StorageResources, ) -> Result<(), SetupServiceError> { let (tx, rx) = rss_channel(our_bootstrap_address); let rss = RackSetupService::new( log.new(o!("component" => "RSS")), config, + storage_resources, tx, member_device_id_certs, ); diff --git a/sled-agent/src/ledger.rs b/sled-agent/src/ledger.rs index 84d686f8b4b..25ee7b604be 100644 --- a/sled-agent/src/ledger.rs +++ b/sled-agent/src/ledger.rs @@ -25,7 +25,10 @@ pub enum Error { }, #[error("Failed to write the ledger to storage")] - FailedToAccessStorage, + FailedToWrite, + + #[error("Not found in storage")] + NotFound, } impl Error { @@ -58,15 +61,27 @@ pub struct Ledger { } impl Ledger { + /// Creates a ledger with a new initial value, ready to be written to + /// `paths.` + pub fn new_with(log: &Logger, paths: Vec, default: T) -> Self { + Self { log: log.clone(), ledger: default, paths } + } + /// Reads the ledger from any of the provided `paths`. /// /// Returns the following, in order: /// - The ledger with the highest generation number /// - If none exists, returns a default ledger - pub async fn new( - log: &Logger, - paths: Vec, - ) -> Result { + pub async fn new(log: &Logger, paths: Vec) -> Option { + // Read the ledgers from storage + if let Some(ledger) = Self::read(log, &paths).await { + Some(Self { log: log.clone(), ledger, paths }) + } else { + None + } + } + + async fn read(log: &Logger, paths: &Vec) -> Option { // Read all the ledgers that we can. 
let mut ledgers = vec![]; for path in paths.iter() { @@ -83,11 +98,7 @@ impl Ledger { prior } }); - - // If we can't read either ledger, start a new one. - let ledger = ledger.unwrap_or_else(|| T::default()); - - Ok(Self { log: log.clone(), ledger, paths }) + ledger } pub fn data(&self) -> &T { @@ -115,7 +126,7 @@ impl Ledger { } if !one_successful_write { - return Err(Error::FailedToAccessStorage); + return Err(Error::FailedToWrite); } Ok(()) } @@ -143,9 +154,7 @@ impl Ledger { } #[async_trait] -pub trait Ledgerable: - Default + DeserializeOwned + Serialize + Send + Sync -{ +pub trait Ledgerable: DeserializeOwned + Serialize + Send + Sync { /// Returns true if [Self] is newer than `other`. fn is_newer_than(&self, other: &Self) -> bool; @@ -167,7 +176,7 @@ pub trait Ledgerable: }) } else { debug!(log, "No ledger in {path}"); - Ok(Self::default()) + Err(Error::NotFound) } } @@ -217,15 +226,21 @@ mod test { let log = &logctx.log; let config_dir = camino_tempfile::Utf8TempDir::new().unwrap(); - let ledger = - Ledger::::new(&log, vec![config_dir.path().to_path_buf()]) - .await - .expect("Failed to create ledger"); + let ledger = Ledger::::new_with( + &log, + vec![config_dir.path().to_path_buf()], + Data::default(), + ); // Since we haven't previously stored anything, expect to read a default // value. 
assert_eq!(ledger.data(), &Data::default()); + let ledger = + Ledger::::new(&log, vec![config_dir.path().to_path_buf()]) + .await; + assert!(ledger.is_none()); + logctx.cleanup_successful(); } @@ -238,9 +253,11 @@ mod test { let config_path = config_dir.path().join("ledger.toml"); // Create the ledger within a configuration directory - let mut ledger = Ledger::::new(&log, vec![config_path.clone()]) - .await - .expect("Failed to create ledger"); + let mut ledger = Ledger::::new_with( + &log, + vec![config_path.clone()], + Data::default(), + ); ledger.data_mut().contents = "new contents".to_string(); ledger.commit().await.expect("Failed to write ledger"); assert!(config_path.exists()); @@ -248,9 +265,8 @@ mod test { drop(ledger); // Re-create the ledger, observe the new contents. - let ledger = Ledger::::new(&log, vec![config_path.clone()]) - .await - .expect("Failed to create ledger"); + let ledger = + Ledger::::new(&log, vec![config_path.clone()]).await.unwrap(); assert_eq!(ledger.data().contents, "new contents"); assert_eq!(ledger.data().generation, 1); @@ -275,7 +291,7 @@ mod test { let mut ledger = Ledger::::new(&log, config_paths.clone()) .await - .expect("Failed to create ledger"); + .expect("Failed to read ledger"); ledger.data_mut().contents = "new contents".to_string(); ledger.commit().await.expect("Failed to write ledger"); @@ -287,7 +303,7 @@ mod test { // Let's write again, but only using one of the two config dirs. let mut ledger = Ledger::::new(&log, config_paths[..1].to_vec()) .await - .expect("Failed to create ledger"); + .expect("Failed to read ledger"); ledger.data_mut().contents = "even newer contents".to_string(); ledger.commit().await.expect("Failed to write ledger"); @@ -296,7 +312,7 @@ mod test { // Re-create the ledger (using both config dirs), observe the newest contents. 
let ledger = Ledger::::new(&log, config_paths.clone()) .await - .expect("Failed to create ledger"); + .expect("Failed to read ledger"); assert_eq!(ledger.data().contents, "even newer contents"); assert_eq!(ledger.data().generation, 2); @@ -319,9 +335,11 @@ mod test { .map(|d| d.path().join("ledger.toml")) .collect::>(); - let mut ledger = Ledger::::new(&log, config_paths.clone()) - .await - .expect("Failed to create ledger"); + let mut ledger = Ledger::::new_with( + &log, + config_paths.clone(), + Data::default(), + ); ledger.data_mut().contents = "written to both configs".to_string(); ledger.commit().await.expect("Failed to write ledger"); @@ -339,7 +357,7 @@ mod test { let mut ledger = Ledger::::new(&log, config_paths.clone()) .await - .expect("Failed to create ledger"); + .expect("Failed to read ledger"); assert_eq!(ledger.data().contents, "written to both configs"); assert_eq!(ledger.data().generation, 1); @@ -351,7 +369,7 @@ mod test { // We can still parse the ledger from a single path let ledger = Ledger::::new(&log, config_paths.clone()) .await - .expect("Failed to create ledger"); + .expect("Failed to read ledger"); assert_eq!(ledger.data().contents, "written to one config"); assert_eq!(ledger.data().generation, 2); @@ -364,16 +382,17 @@ mod test { assert!(!config_paths[0].exists()); assert!(!config_paths[1].exists()); - let mut ledger = Ledger::::new(&log, config_paths.clone()) - .await - .expect("Failed to create ledger"); + let ledger = Ledger::::new(&log, config_paths.clone()).await; + assert!(ledger.is_none()); + let mut ledger = Ledger::::new_with( + &log, + config_paths.clone(), + Data::default(), + ); assert_eq!(ledger.data(), &Data::default()); let err = ledger.commit().await.unwrap_err(); - assert!( - matches!(err, Error::FailedToAccessStorage), - "Unexpected error: {err}" - ); + assert!(matches!(err, Error::FailedToWrite), "Unexpected error: {err}"); logctx.cleanup_successful(); } diff --git a/sled-agent/src/rack_setup/plan/service.rs 
b/sled-agent/src/rack_setup/plan/service.rs index f9fe592ee52..2d1f7a01f3f 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -5,12 +5,14 @@ //! Plan generation for "where should services be initialized". use crate::bootstrap::params::SledAgentRequest; +use crate::ledger::{Ledger, Ledgerable}; use crate::params::{ DatasetEnsureBody, ServiceType, ServiceZoneRequest, ServiceZoneService, ZoneType, }; use crate::rack_setup::config::SetupServiceConfig as Config; -use camino::{Utf8Path, Utf8PathBuf}; +use crate::storage_manager::StorageResources; +use camino::Utf8PathBuf; use dns_service_client::types::DnsConfigParams; use internal_dns::{ServiceName, DNS_ZONE}; use omicron_common::address::{ @@ -61,11 +63,6 @@ const PANTRY_COUNT: usize = 1; // when Nexus provisions external DNS zones. const EXTERNAL_DNS_COUNT: usize = 1; -fn rss_service_plan_path() -> Utf8PathBuf { - Utf8Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join("rss-service-plan.toml") -} - /// Describes errors which may occur while generating a plan for services. 
#[derive(Error, Debug)] pub enum PlanError { @@ -76,8 +73,8 @@ pub enum PlanError { err: std::io::Error, }, - #[error("Cannot deserialize TOML file at {path}: {err}")] - Toml { path: Utf8PathBuf, err: toml::de::Error }, + #[error("Failed to access ledger: {0}")] + Ledger(#[from] crate::ledger::Error), #[error("Error making HTTP request to Sled Agent: {0}")] SledApi(#[from] SledAgentError), @@ -103,35 +100,39 @@ pub struct SledRequest { pub services: Vec, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct Plan { pub services: HashMap, pub dns_config: DnsConfigParams, } +impl Ledgerable for Plan { + fn is_newer_than(&self, _other: &Self) -> bool { + true + } + fn generation_bump(&mut self) {} +} +const RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan.toml"; + impl Plan { - pub async fn load(log: &Logger) -> Result, PlanError> { + pub async fn load( + log: &Logger, + storage: &StorageResources, + ) -> Result, PlanError> { + let paths: Vec = storage + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .await + .into_iter() + .map(|p| p.join(RSS_SERVICE_PLAN_FILENAME)) + .collect(); + // If we already created a plan for this RSS to allocate // services to sleds, re-use that existing plan. 
- let rss_service_plan_path = rss_service_plan_path(); - if rss_service_plan_path.exists() { - info!(log, "RSS plan already created, loading from file"); + let ledger = Ledger::::new(log, paths.clone()).await; - let plan: Self = toml::from_str( - &tokio::fs::read_to_string(&rss_service_plan_path) - .await - .map_err(|err| PlanError::Io { - message: format!( - "Loading RSS plan {rss_service_plan_path:?}" - ), - err, - })?, - ) - .map_err(|err| PlanError::Toml { - path: rss_service_plan_path, - err, - })?; - Ok(Some(plan)) + if let Some(ledger) = ledger { + info!(log, "RSS plan already created, loading from file"); + Ok(Some(ledger.data().clone())) } else { Ok(None) } @@ -226,6 +227,7 @@ impl Plan { pub async fn create( log: &Logger, config: &Config, + storage: &StorageResources, sleds: &HashMap, ) -> Result { let reserved_rack_subnet = ReservedRackSubnet::new(config.az_subnet()); @@ -573,23 +575,15 @@ impl Plan { let plan = Self { services, dns_config }; // Once we've constructed a plan, write it down to durable storage. 
- let serialized_plan = - toml::Value::try_from(&plan).unwrap_or_else(|e| { - panic!("Cannot serialize configuration: {:#?}: {}", plan, e) - }); - let plan_str = toml::to_string(&serialized_plan) - .expect("Cannot turn config to string"); - - info!(log, "Plan serialized as: {}", plan_str); - let path = rss_service_plan_path(); - tokio::fs::write(&path, plan_str).await.map_err(|err| { - PlanError::Io { - message: format!("Storing RSS service plan to {path:?}"), - err, - } - })?; + let paths: Vec = storage + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .await + .into_iter() + .map(|p| p.join(RSS_SERVICE_PLAN_FILENAME)) + .collect(); + let mut ledger = Ledger::::new_with(log, paths, plan.clone()); + ledger.commit().await?; info!(log, "Service plan written to storage"); - Ok(plan) } } diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index 41c839c0e11..42b897c77ea 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -9,8 +9,10 @@ use crate::bootstrap::{ params::SledAgentRequest, trust_quorum::{RackSecret, ShareDistribution}, }; +use crate::ledger::{Ledger, Ledgerable}; use crate::rack_setup::config::SetupServiceConfig as Config; -use camino::{Utf8Path, Utf8PathBuf}; +use crate::storage_manager::StorageResources; +use camino::Utf8PathBuf; use serde::{Deserialize, Serialize}; use slog::Logger; use sprockets_host::Ed25519Certificate; @@ -19,11 +21,6 @@ use std::net::{Ipv6Addr, SocketAddrV6}; use thiserror::Error; use uuid::Uuid; -fn rss_sled_plan_path() -> Utf8PathBuf { - Utf8Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join("rss-sled-plan.toml") -} - pub fn generate_rack_secret<'a>( rack_secret_threshold: usize, member_device_id_certs: &'a [Ed25519Certificate], @@ -75,14 +72,22 @@ pub enum PlanError { err: std::io::Error, }, - #[error("Cannot deserialize TOML file at {path}: {err}")] - Toml { path: Utf8PathBuf, err: toml::de::Error }, + #[error("Failed to access 
ledger: {0}")] + Ledger(#[from] crate::ledger::Error), #[error("Failed to split rack secret: {0:?}")] SplitRackSecret(vsss_rs::Error), } -#[derive(Debug, Serialize, Deserialize)] +impl Ledgerable for Plan { + fn is_newer_than(&self, _other: &Self) -> bool { + true + } + fn generation_bump(&mut self) {} +} +const RSS_SLED_PLAN_FILENAME: &str = "rss-sled-plan.toml"; + +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct Plan { pub rack_id: Uuid, pub sleds: HashMap, @@ -93,25 +98,23 @@ pub struct Plan { } impl Plan { - pub async fn load(log: &Logger) -> Result, PlanError> { + pub async fn load( + log: &Logger, + storage: &StorageResources, + ) -> Result, PlanError> { + let paths: Vec = storage + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .await + .into_iter() + .map(|p| p.join(RSS_SLED_PLAN_FILENAME)) + .collect(); + // If we already created a plan for this RSS to allocate // subnets/requests to sleds, re-use that existing plan. - let rss_sled_plan_path = rss_sled_plan_path(); - if rss_sled_plan_path.exists() { + let ledger = Ledger::::new(log, paths.clone()).await; + if let Some(ledger) = ledger { info!(log, "RSS plan already created, loading from file"); - - let plan: Self = toml::from_str( - &tokio::fs::read_to_string(&rss_sled_plan_path).await.map_err( - |err| PlanError::Io { - message: format!( - "Loading RSS plan {rss_sled_plan_path:?}" - ), - err, - }, - )?, - ) - .map_err(|err| PlanError::Toml { path: rss_sled_plan_path, err })?; - Ok(Some(plan)) + Ok(Some(ledger.data().clone())) } else { Ok(None) } @@ -120,6 +123,7 @@ impl Plan { pub async fn create( log: &Logger, config: &Config, + storage: &StorageResources, bootstrap_addrs: HashSet, ) -> Result { let rack_id = Uuid::new_v4(); @@ -159,23 +163,16 @@ impl Plan { let plan = Self { rack_id, sleds, config: config.clone() }; // Once we've constructed a plan, write it down to durable storage. 
- let serialized_plan = - toml::Value::try_from(&plan).unwrap_or_else(|e| { - panic!("Cannot serialize configuration: {:#?}: {}", plan, e) - }); - let plan_str = toml::to_string(&serialized_plan) - .expect("Cannot turn config to string"); - - info!(log, "Plan serialized as: {}", plan_str); - let path = rss_sled_plan_path(); - tokio::fs::write(&path, plan_str).await.map_err(|err| { - PlanError::Io { - message: format!("Storing RSS sled plan to {path:?}"), - err, - } - })?; + let paths: Vec = storage + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .await + .into_iter() + .map(|p| p.join(RSS_SLED_PLAN_FILENAME)) + .collect(); + + let mut ledger = Ledger::::new_with(log, paths, plan.clone()); + ledger.commit().await?; info!(log, "Sled plan written to storage"); - Ok(plan) } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index de5e8f99536..ec1bdc67e98 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -59,6 +59,7 @@ use crate::bootstrap::config::BOOTSTRAP_AGENT_HTTP_PORT; use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::SledAgentRequest; use crate::bootstrap::rss_handle::BootstrapAgentHandle; +use crate::ledger::{Ledger, Ledgerable}; use crate::nexus::d2n_params; use crate::params::{ AutonomousServiceOnlyError, DatasetEnsureBody, ServiceType, @@ -70,7 +71,8 @@ use crate::rack_setup::plan::service::{ use crate::rack_setup::plan::sled::{ generate_rack_secret, Plan as SledPlan, PlanError as SledPlanError, }; -use camino::{Utf8Path, Utf8PathBuf}; +use crate::storage_manager::StorageResources; +use camino::Utf8PathBuf; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; use internal_dns::resolver::{DnsError, Resolver as DnsResolver}; use internal_dns::ServiceName; @@ -106,6 +108,9 @@ pub enum SetupServiceError { err: std::io::Error, }, + #[error("Failed to access ledger: {0}")] + Ledger(#[from] crate::ledger::Error), + 
#[error("Cannot create plan for sled services: {0}")] ServicePlan(#[from] ServicePlanError), @@ -165,6 +170,7 @@ impl RackSetupService { pub(crate) fn new( log: Logger, config: Config, + storage_resources: StorageResources, local_bootstrap_agent: BootstrapAgentHandle, // TODO-cleanup: We should be collecting the device ID certs of all // trust quorum members over the management network. Currently we don't @@ -175,7 +181,12 @@ impl RackSetupService { let handle = tokio::task::spawn(async move { let svc = ServiceInner::new(log.clone()); if let Err(e) = svc - .run(&config, local_bootstrap_agent, &member_device_id_certs) + .run( + &config, + &storage_resources, + local_bootstrap_agent, + &member_device_id_certs, + ) .await { warn!(log, "RSS injection failed: {}", e); @@ -211,10 +222,16 @@ impl RackSetupService { } } -fn rss_completed_marker_path() -> Utf8PathBuf { - Utf8Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join("rss-plan-completed.marker") +#[derive(Clone, Serialize, Deserialize, Default)] +struct RssCompleteMarker {} + +impl Ledgerable for RssCompleteMarker { + fn is_newer_than(&self, _other: &Self) -> bool { + true + } + fn generation_bump(&mut self) {} } +const RSS_COMPLETED_FILENAME: &str = "rss-plan-completed.marker"; /// The implementation of the Rack Setup Service. struct ServiceInner { @@ -815,16 +832,27 @@ impl ServiceInner { async fn run( &self, config: &Config, + storage_resources: &StorageResources, local_bootstrap_agent: BootstrapAgentHandle, member_device_id_certs: &[Ed25519Certificate], ) -> Result<(), SetupServiceError> { info!(self.log, "Injecting RSS configuration: {:#?}", config); + let marker_paths: Vec = storage_resources + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .await + .into_iter() + .map(|p| p.join(RSS_COMPLETED_FILENAME)) + .collect(); + + let ledger = + Ledger::::new(&self.log, marker_paths.clone()) + .await; + // Check if a previous RSS plan has completed successfully. 
// // If it has, the system should be up-and-running. - let rss_completed_marker_path = rss_completed_marker_path(); - if rss_completed_marker_path.exists() { + if ledger.is_some() { // TODO(https://github.com/oxidecomputer/omicron/issues/724): If the // running configuration doesn't match Config, we could try to // update things. @@ -833,7 +861,7 @@ impl ServiceInner { "RSS configuration looks like it has already been applied", ); - let sled_plan = SledPlan::load(&self.log) + let sled_plan = SledPlan::load(&self.log, storage_resources) .await? .expect("Sled plan should exist if completed marker exists"); if &sled_plan.config != config { @@ -841,7 +869,7 @@ impl ServiceInner { "Configuration changed".to_string(), )); } - let service_plan = ServicePlan::load(&self.log) + let service_plan = ServicePlan::load(&self.log, storage_resources) .await? .expect("Service plan should exist if completed marker exists"); self.handoff_to_nexus(&config, &sled_plan, &service_plan).await?; @@ -859,7 +887,8 @@ impl ServiceInner { } BootstrapAddressDiscovery::OnlyThese { addrs } => addrs.clone(), }; - let maybe_sled_plan = SledPlan::load(&self.log).await?; + let maybe_sled_plan = + SledPlan::load(&self.log, storage_resources).await?; if let Some(plan) = &maybe_sled_plan { let stored_peers: HashSet = plan.sleds.keys().map(|a| *a.ip()).collect(); @@ -884,7 +913,13 @@ impl ServiceInner { plan } else { info!(self.log, "Creating new allocation plan"); - SledPlan::create(&self.log, config, bootstrap_addrs).await? + SledPlan::create( + &self.log, + config, + &storage_resources, + bootstrap_addrs, + ) + .await? }; let config = &plan.config; @@ -941,12 +976,19 @@ impl ServiceInner { get_sled_address(initialization_request.subnet) }) .collect(); - let service_plan = - if let Some(plan) = ServicePlan::load(&self.log).await? { - plan - } else { - ServicePlan::create(&self.log, &config, &plan.sleds).await? 
- }; + let service_plan = if let Some(plan) = + ServicePlan::load(&self.log, storage_resources).await? + { + plan + } else { + ServicePlan::create( + &self.log, + &config, + &storage_resources, + &plan.sleds, + ) + .await? + }; // Set up internal DNS services first and write the initial // DNS configuration to the internal DNS servers. @@ -1034,12 +1076,12 @@ impl ServiceInner { info!(self.log, "Finished setting up services"); // Finally, mark that we've completed executing the plans. - tokio::fs::File::create(&rss_completed_marker_path).await.map_err( - |err| SetupServiceError::Io { - message: format!("creating {rss_completed_marker_path:?}"), - err, - }, - )?; + let mut ledger = Ledger::::new_with( + &self.log, + marker_paths.clone(), + RssCompleteMarker::default(), + ); + ledger.commit().await?; // At this point, even if we reboot, we must not try to manage sleds, // services, or DNS records. diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index eba8a9a00f4..360690bb9a7 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -436,15 +436,14 @@ impl ServiceManager { pub async fn load_non_storage_services(&self) -> Result<(), Error> { let log = &self.inner.log; let mut existing_zones = self.inner.zones.lock().await; - let ledger = Ledger::::new( + let Some(ledger) = Ledger::::new( log, self.all_service_ledgers().await, ) - .await?; - let services = ledger.data(); - if services.requests.is_empty() { + .await else { return Ok(()); - } + }; + let services = ledger.data(); // Initialize and DNS and NTP services first as they are required // for time synchronization, which is a pre-requisite for the other @@ -513,15 +512,14 @@ impl ServiceManager { pub async fn load_storage_services(&self) -> Result<(), Error> { let log = &self.inner.log; let mut existing_zones = self.inner.dataset_zones.lock().await; - let ledger = Ledger::::new( + let Some(ledger) = Ledger::::new( log, self.all_storage_service_ledgers().await, ) - .await?; - 
let services = ledger.data(); - if services.requests.is_empty() { + .await else { return Ok(()); - } + }; + let services = ledger.data(); self.initialize_services_locked( &mut existing_zones, &services.requests, @@ -1787,11 +1785,18 @@ impl ServiceManager { let mut existing_zones = self.inner.zones.lock().await; // Read the existing set of services from the ledger. - let mut ledger = Ledger::::new( - log, - self.all_service_ledgers().await, - ) - .await?; + let service_paths = self.all_service_ledgers().await; + let mut ledger = + match Ledger::::new(log, service_paths.clone()) + .await + { + Some(ledger) => ledger, + None => Ledger::::new_with( + log, + service_paths.clone(), + AllZoneRequests::default(), + ), + }; let ledger_zone_requests = ledger.data_mut(); let new_zone_requests: Vec = { @@ -1852,11 +1857,18 @@ impl ServiceManager { let mut existing_zones = self.inner.dataset_zones.lock().await; // Read the existing set of services from the ledger. - let mut ledger = Ledger::::new( - log, - self.all_storage_service_ledgers().await, - ) - .await?; + let service_paths = self.all_storage_service_ledgers().await; + let mut ledger = + match Ledger::::new(log, service_paths.clone()) + .await + { + Some(ledger) => ledger, + None => Ledger::::new_with( + log, + service_paths.clone(), + AllZoneRequests::default(), + ), + }; let ledger_zone_requests = ledger.data_mut(); if !ledger_zone_requests From ced5dc92aa41c74da6a846aaa9cd59d20d9c7843 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 May 2023 15:09:04 -0400 Subject: [PATCH 18/39] review feedback --- illumos-utils/src/running_zone.rs | 6 +++--- sled-agent/src/config.rs | 10 ++++------ sled-agent/src/services.rs | 10 ++++------ sled-agent/src/updates.rs | 12 ++++-------- 4 files changed, 15 insertions(+), 23 deletions(-) diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 31c7d0a9729..9676e5a48f2 100644 --- a/illumos-utils/src/running_zone.rs +++ 
b/illumos-utils/src/running_zone.rs @@ -146,8 +146,8 @@ impl RunningZone { } /// Returns the filesystem path to the zone's root - pub fn root(&self) -> String { - format!("{}/root", self.inner.zonepath) + pub fn root(&self) -> Utf8PathBuf { + self.inner.zonepath.join("root") } /// Runs a command within the Zone, return the output. @@ -641,7 +641,7 @@ impl InstalledZone { let full_zone_name = Self::get_zone_name(zone_type, unique_name); let zone_image_path = - Utf8PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_type)); + Utf8PathBuf::from(format!("/opt/oxide/{}.tar.gz", zone_type)); let net_device_names: Vec = opte_ports .iter() diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index e5bb8aa1ba6..e98f9f9a71e 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -127,17 +127,15 @@ mod test { let smf = Utf8PathBuf::from(manifest).join("../smf/sled-agent"); let mut configs_seen = 0; - for variant in std::fs::read_dir(smf).unwrap() { + for variant in smf.read_dir_utf8().unwrap() { let variant = variant.unwrap(); if variant.file_type().unwrap().is_dir() { - for entry in std::fs::read_dir(variant.path()).unwrap() { + for entry in variant.path().read_dir_utf8().unwrap() { let entry = entry.unwrap(); if entry.file_name() == "config.toml" { let path = entry.path(); - let utf8_path: &Utf8Path = - path.as_path().try_into().unwrap(); - Config::from_file(&utf8_path).unwrap_or_else(|_| { - panic!("Failed to parse config {utf8_path}") + Config::from_file(&path).unwrap_or_else(|_| { + panic!("Failed to parse config {path}") }); configs_seen += 1; } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index eba8a9a00f4..03b6812af6e 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1906,13 +1906,11 @@ impl ServiceManager { info!(self.inner.log, "Setting boot time to {:?}", now); - let files: Vec = zones + let files: Vec = zones .iter() .map(|z| z.root()) - .chain(iter::once("".to_string())) - 
.flat_map(|r| { - [format!("{r}/var/adm/utmpx"), format!("{r}/var/adm/wtmpx")] - }) + .chain(iter::once(Utf8PathBuf::from("/"))) + .flat_map(|r| [r.join("var/adm/utmpx"), r.join("var/adm/wtmpx")]) .collect(); for file in files { @@ -1920,7 +1918,7 @@ impl ServiceManager { let cmd = command.args(&[ "/usr/platform/oxide/bin/tmpx", &format!("{}", now.as_secs()), - &file, + &file.as_str(), ]); match execute(cmd) { Err(e) => { diff --git a/sled-agent/src/updates.rs b/sled-agent/src/updates.rs index 43eeac17fcd..6d837700e95 100644 --- a/sled-agent/src/updates.rs +++ b/sled-agent/src/updates.rs @@ -219,23 +219,19 @@ impl UpdateManager { let mut components = vec![]; let dir = &self.config.zone_artifact_path; - for entry in std::fs::read_dir(dir).map_err(|err| io_err(dir, err))? { + for entry in dir.read_dir_utf8().map_err(|err| io_err(dir, err))? { let entry = entry.map_err(|err| io_err(dir, err))?; let file_type = entry.file_type().map_err(|err| io_err(dir, err))?; - let path: Utf8PathBuf = entry.path().try_into()?; + let path = entry.path(); - if file_type.is_file() - && entry.file_name().to_string_lossy().ends_with(".tar.gz") - { + if file_type.is_file() && entry.file_name().ends_with(".tar.gz") { // Zone Images are currently identified as individual components. // // This logic may be tweaked in the future, depending on how we // bundle together zones. components.push(self.component_get_zone_version(&path).await?); - } else if file_type.is_dir() - && entry.file_name().to_string_lossy() == "sled-agent" - { + } else if file_type.is_dir() && entry.file_name() == "sled-agent" { // Sled Agent is the only non-zone file recognized as a component. 
let version_path = path.join("VERSION"); let version = tokio::fs::read_to_string(&version_path) From db4bd5180b7096f708a78b0e765e3fa68d044b31 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 May 2023 15:13:23 -0400 Subject: [PATCH 19/39] clippy --- installinator/src/write.rs | 2 -- sled-agent/src/services.rs | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/installinator/src/write.rs b/installinator/src/write.rs index 980e0740869..487754b7bd6 100644 --- a/installinator/src/write.rs +++ b/installinator/src/write.rs @@ -96,8 +96,6 @@ impl WriteDestination { match disk.boot_image_devfs_path(raw_devfs_path) { Ok(path) => { - let path = Utf8PathBuf::try_from(path) - .context("non-UTF8 drive path")?; info!( log, "found target M.2 disk"; "identity" => ?disk.identity(), diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 03b6812af6e..32e8474f092 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -862,7 +862,7 @@ impl ServiceManager { let service = DnsClient {}; let smfh = SmfHelper::new(&running_zone, &service); - let etc = Utf8PathBuf::from(running_zone.root()).join("etc"); + let etc = running_zone.root().join("etc"); let resolv_conf = etc.join("resolv.conf"); let nsswitch_conf = etc.join("nsswitch.conf"); let nsswitch_dns = etc.join("nsswitch.dns"); From c5947b60c587361f39d20e717fc69b96247047c9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 May 2023 16:50:00 -0400 Subject: [PATCH 20/39] Use the ledger for sled-agent-request --- common/src/lib.rs | 10 -- package/src/bin/omicron-package.rs | 9 -- sled-agent/src/bootstrap/agent.rs | 203 ++++++++++++++--------------- 3 files changed, 97 insertions(+), 125 deletions(-) diff --git a/common/src/lib.rs b/common/src/lib.rs index 1d7c691e7af..c7ca10128e8 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -48,13 +48,3 @@ macro_rules! generate_logging_api { ); }; } - -/// Location on internal storage where sled-specific information is stored. 
-/// -/// This is mostly private to the `omicron-sled-agent` crate, but exists in -/// common so it may be cleared by the installation tools. -/// -/// NOTE: Be careful when modifying this path - the installation tools will -/// **remove the entire directory** to re-install/uninstall the system. -// #[deprecated] -pub const OMICRON_CONFIG_PATH: &'static str = "/var/oxide"; diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index 35746e1b49d..b5f53548312 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -606,13 +606,6 @@ fn uninstall_all_packages(config: &Config) { } } -fn uninstall_omicron_config() { - // Once all packages have been removed, also remove any locally-stored - // configuration. - remove_all_unless_already_removed(omicron_common::OMICRON_CONFIG_PATH) - .unwrap(); -} - fn remove_file_unless_already_removed>(path: P) -> Result<()> { if let Err(e) = std::fs::remove_file(path.as_ref()) { match e.kind() { @@ -671,8 +664,6 @@ async fn do_deactivate(config: &Config) -> Result<()> { async fn do_uninstall(config: &Config) -> Result<()> { do_deactivate(config).await?; - info!(config.log, "Uninstalling Omicron configuration"); - uninstall_omicron_config(); info!(config.log, "Removing datasets"); uninstall_all_omicron_datasets(config)?; Ok(()) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 1a9c747e8ee..fdc8ce73ecf 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -19,12 +19,13 @@ use super::trust_quorum::{ }; use super::views::SledAgentResponse; use crate::config::Config as SledConfig; +use crate::ledger::{Ledger, Ledgerable}; use crate::server::Server as SledServer; use crate::services::ServiceManager; use crate::sp::SpHandle; use crate::storage_manager::{StorageManager, StorageResources}; use crate::updates::UpdateManager; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8PathBuf; use 
ddm_admin_client::{Client as DdmAdminClient, DdmError}; use futures::stream::{self, StreamExt, TryStreamExt}; use illumos_utils::addrobj::AddrObject; @@ -74,12 +75,12 @@ pub enum BootstrapError { #[error("Error monitoring hardware: {0}")] Hardware(#[from] crate::bootstrap::hardware::Error), + #[error("Failed to access ledger: {0}")] + Ledger(#[from] crate::ledger::Error), + #[error("Error managing sled agent: {0}")] SledError(String), - #[error("Error deserializing toml from {path}: {err}")] - Toml { path: Utf8PathBuf, err: toml::de::Error }, - #[error(transparent)] TrustQuorum(#[from] TrustQuorumError), @@ -203,10 +204,7 @@ pub struct Agent { global_zone_bootstrap_link_local_address: Ipv6Addr, } -fn get_sled_agent_request_path() -> Utf8PathBuf { - Utf8Path::new(omicron_common::OMICRON_CONFIG_PATH) - .join("sled-agent-request.toml") -} +const SLED_AGENT_REQUEST_FILE: &str = "sled-agent-request.toml"; // Deletes all state which may be left-over from a previous execution of the // Sled Agent. @@ -263,6 +261,15 @@ async fn cleanup_all_old_global_state( Ok(()) } +async fn sled_config_paths(storage: &StorageResources) -> Vec { + storage + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .await + .into_iter() + .map(|p| p.join(SLED_AGENT_REQUEST_FILE)) + .collect() +} + impl Agent { pub async fn new( log: Logger, @@ -276,24 +283,7 @@ impl Agent { let link = config.link.clone(); let ip = BootstrapInterface::GlobalZone.ip(&link)?; - // We expect this directory to exist - ensure that it does, before any - // subsequent operations which may write configs here. 
- info!( - log, "Ensuring config directory exists"; - "path" => omicron_common::OMICRON_CONFIG_PATH, - ); - tokio::fs::create_dir_all(omicron_common::OMICRON_CONFIG_PATH) - .await - .map_err(|err| BootstrapError::Io { - message: format!( - "Creating config directory {}", - omicron_common::OMICRON_CONFIG_PATH - ), - err, - })?; - let bootstrap_etherstub = bootstrap_etherstub()?; - let bootstrap_etherstub_vnic = Dladm::ensure_etherstub_vnic( &bootstrap_etherstub, ) @@ -401,23 +391,17 @@ impl Agent { *agent.sled_state.lock().await = SledAgentState::Before(Some(hardware_monitor)); - let request_path = get_sled_agent_request_path(); - let trust_quorum = if request_path.exists() { + let paths = + sled_config_paths(&agent.storage_resources.get().unwrap()).await; + let trust_quorum = if let Some(ledger) = + Ledger::::new(&agent.log, paths).await + { info!(agent.log, "Sled already configured, loading sled agent"); - let sled_request: PersistentSledAgentRequest = toml::from_str( - &tokio::fs::read_to_string(&request_path).await.map_err( - |err| BootstrapError::Io { - message: format!( - "Reading subnet path from {request_path:?}" - ), - err, - }, - )?, - ) - .map_err(|err| BootstrapError::Toml { path: request_path, err })?; - - let trust_quorum_share = - sled_request.trust_quorum_share.map(ShareDistribution::from); + let sled_request = ledger.data(); + let trust_quorum_share = sled_request + .trust_quorum_share + .clone() + .map(ShareDistribution::from); agent .request_agent(&sled_request.request, &trust_quorum_share) .await?; @@ -553,27 +537,20 @@ impl Agent { // Record this request so the sled agent can be automatically // initialized on the next boot. - // - // danger handling: `serialized_request` contains our trust quorum - // share; we do not log it and only write it to the designated path. 
- let serialized_request = PersistentSledAgentRequest { - request: Cow::Borrowed(request), - trust_quorum_share: trust_quorum_share - .clone() - .map(Into::into), - } - .danger_serialize_as_toml() - .expect("Cannot serialize request"); - - let path = get_sled_agent_request_path(); - tokio::fs::write(&path, &serialized_request).await.map_err( - |err| BootstrapError::Io { - message: format!( - "Recording Sled Agent request to {path:?}" - ), - err, + let paths = + sled_config_paths(&self.storage_resources.get().unwrap()) + .await; + let mut ledger = Ledger::new_with( + &self.log, + paths, + PersistentSledAgentRequest { + request: Cow::Borrowed(request), + trust_quorum_share: trust_quorum_share + .clone() + .map(Into::into), }, - )?; + ); + ledger.commit().await?; // This is the point-of-no-return, where we're committed to the // sled agent starting. @@ -840,28 +817,41 @@ impl Agent { &self, _state: &tokio::sync::MutexGuard<'_, SledAgentState>, ) -> Result<(), BootstrapError> { - tokio::fs::remove_dir_all(omicron_common::OMICRON_CONFIG_PATH) - .await - .or_else(|err| match err.kind() { - std::io::ErrorKind::NotFound => Ok(()), - _ => Err(err), - }) - .map_err(|err| BootstrapError::Io { - message: format!( - "Deleting {}", - omicron_common::OMICRON_CONFIG_PATH - ), - err, - })?; - tokio::fs::create_dir_all(omicron_common::OMICRON_CONFIG_PATH) + let config_dirs = self + .storage_resources + .get() + .unwrap() + .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) .await - .map_err(|err| BootstrapError::Io { - message: format!( - "Creating config directory {}", - omicron_common::OMICRON_CONFIG_PATH - ), - err, - })?; + .into_iter(); + + for dir in config_dirs { + for entry in dir.read_dir_utf8().map_err(|err| { + BootstrapError::Io { message: format!("Deleting {dir}"), err } + })? 
{ + let entry = entry.map_err(|err| BootstrapError::Io { + message: format!("Deleting {dir}"), + err, + })?; + + let path = entry.path(); + let file_type = + entry.file_type().map_err(|err| BootstrapError::Io { + message: format!("Deleting {path}"), + err, + })?; + + if file_type.is_dir() { + tokio::fs::remove_dir_all(path).await + } else { + tokio::fs::remove_file(path).await + } + .map_err(|err| BootstrapError::Io { + message: format!("Deleting {path}"), + err, + })?; + } + } Ok(()) } @@ -978,40 +968,34 @@ impl Agent { } } -// We intentionally DO NOT derive `Debug` or `Serialize`; both provide avenues +// We intentionally DO NOT derive `Debug``; it provides avenues // by which we may accidentally log the contents of our trust quorum share. -#[derive(Deserialize, PartialEq)] +#[derive(Clone, Serialize, Deserialize, PartialEq)] struct PersistentSledAgentRequest<'a> { request: Cow<'a, SledAgentRequest>, trust_quorum_share: Option, } -impl PersistentSledAgentRequest<'_> { - /// On success, the returned string will contain our raw - /// `trust_quorum_share`. This method is named `danger_*` to remind the - /// caller that they must not log this string. 
- fn danger_serialize_as_toml(&self) -> Result { - #[derive(Serialize)] - #[serde(remote = "PersistentSledAgentRequest")] - struct PersistentSledAgentRequestDef<'a> { - request: Cow<'a, SledAgentRequest>, - trust_quorum_share: Option, - } - - let mut out = String::with_capacity(128); - let serializer = toml::Serializer::new(&mut out); - PersistentSledAgentRequestDef::serialize(self, serializer)?; - Ok(out) +impl<'a> Ledgerable for PersistentSledAgentRequest<'a> { + fn is_newer_than(&self, _other: &Self) -> bool { + true } + fn generation_bump(&mut self) {} } #[cfg(test)] mod tests { use super::*; + use omicron_test_utils::dev::test_setup_log; use uuid::Uuid; - #[test] - fn persistent_sled_agent_request_serialization_round_trips() { + #[tokio::test] + async fn persistent_sled_agent_request_serialization() { + let logctx = + test_setup_log("persistent_sled_agent_request_serialization"); + let log = &logctx.log; + + // No secret generated if we have <= 1 sled let secret = RackSecret::new(); let (mut shares, verifier) = secret.split(2, 4).unwrap(); @@ -1034,10 +1018,17 @@ mod tests { ), }; - let serialized = request.danger_serialize_as_toml().unwrap(); - let deserialized: PersistentSledAgentRequest = - toml::from_str(&serialized).unwrap(); + let tempdir = camino_tempfile::Utf8TempDir::new().unwrap(); + let paths = vec![tempdir.path().join("test-file")]; + + let mut ledger = Ledger::new_with(log, paths.clone(), request.clone()); + ledger.commit().await.expect("Failed to write to ledger"); + + let ledger = Ledger::::new(log, paths) + .await + .expect("Failt to read request"); - assert!(request == deserialized, "serialization round trip failed"); + assert!(&request == ledger.data(), "serialization round trip failed"); + logctx.cleanup_successful(); } } From 884f6d5e2a46a37c23b6c78547a4a1e1933acb9c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 May 2023 17:06:40 -0400 Subject: [PATCH 21/39] better bootstrap agent init --- sled-agent/src/bootstrap/agent.rs | 66 
+++++++++++++++++-------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index fdc8ce73ecf..e305d84885d 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -41,7 +41,6 @@ use omicron_common::api::external::Error as ExternalError; use omicron_common::backoff::{ retry_notify, retry_policy_internal_service_aggressive, BackoffError, }; -use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use sled_hardware::underlay::BootstrapInterface; use sled_hardware::HardwareManager; @@ -195,7 +194,7 @@ pub struct Agent { share: Mutex>, sled_state: Mutex, - storage_resources: OnceCell, + storage_resources: StorageResources, config: Config, sled_config: SledConfig, sp: Option, @@ -366,14 +365,26 @@ impl Agent { // the switch zone. info!(log, "Bootstrap Agent monitoring for hardware"); + let hardware_monitor = Self::hardware_monitor( + &ba_log, + &config.link, + &sled_config, + global_zone_bootstrap_link_local_address, + ) + .await?; + + let storage_resources = hardware_monitor.storage().clone(); + let agent = Agent { log: ba_log, parent_log: log, ip, rss_access: Mutex::new(()), share: Mutex::new(None), - sled_state: Mutex::new(SledAgentState::Before(None)), - storage_resources: OnceCell::new(), + sled_state: Mutex::new(SledAgentState::Before(Some( + hardware_monitor, + ))), + storage_resources, config: config.clone(), sled_config, sp, @@ -381,18 +392,7 @@ impl Agent { global_zone_bootstrap_link_local_address, }; - let hardware_monitor = agent.start_hardware_monitor().await?; - // TODO... can I make this less shite? 
- agent - .storage_resources - .set(hardware_monitor.storage().clone()) - .map_err(|_| "Failed to set storage") - .unwrap(); - *agent.sled_state.lock().await = - SledAgentState::Before(Some(hardware_monitor)); - - let paths = - sled_config_paths(&agent.storage_resources.get().unwrap()).await; + let paths = sled_config_paths(&agent.storage_resources).await; let trust_quorum = if let Some(ledger) = Ledger::::new(&agent.log, paths).await { @@ -415,17 +415,32 @@ impl Agent { async fn start_hardware_monitor( &self, + ) -> Result { + Self::hardware_monitor( + &self.log, + &self.config.link, + &self.sled_config, + self.global_zone_bootstrap_link_local_address, + ) + .await + } + + async fn hardware_monitor( + log: &Logger, + link: &illumos_utils::dladm::PhysicalLink, + sled_config: &SledConfig, + global_zone_bootstrap_link_local_address: Ipv6Addr, ) -> Result { let underlay_etherstub = underlay_etherstub()?; let underlay_etherstub_vnic = underlay_etherstub_vnic(&underlay_etherstub)?; let bootstrap_etherstub = bootstrap_etherstub()?; let switch_zone_bootstrap_address = - BootstrapInterface::SwitchZone.ip(&self.config.link)?; + BootstrapInterface::SwitchZone.ip(&link)?; let hardware_monitor = HardwareMonitor::new( - &self.log, - &self.sled_config, - self.global_zone_bootstrap_link_local_address, + &log, + &sled_config, + global_zone_bootstrap_link_local_address, underlay_etherstub, underlay_etherstub_vnic, bootstrap_etherstub, @@ -537,9 +552,7 @@ impl Agent { // Record this request so the sled agent can be automatically // initialized on the next boot. 
- let paths = - sled_config_paths(&self.storage_resources.get().unwrap()) - .await; + let paths = sled_config_paths(&self.storage_resources).await; let mut ledger = Ledger::new_with( &self.log, paths, @@ -766,10 +779,7 @@ impl Agent { .as_ref() .map(|sp_config| sp_config.trust_quorum_members.clone()) .unwrap_or_default(), - self.storage_resources - .get() - .expect("Should set storage during initialization") - .clone(), + self.storage_resources.clone(), ) .await?; Ok(()) @@ -819,8 +829,6 @@ impl Agent { ) -> Result<(), BootstrapError> { let config_dirs = self .storage_resources - .get() - .unwrap() .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) .await .into_iter(); From b201d58b47b48267d90e101a482d62b85a2da5ce Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 May 2023 17:14:56 -0400 Subject: [PATCH 22/39] Fix sled agent tests --- sled-agent/src/bootstrap/agent.rs | 1 - sled-agent/src/ledger.rs | 28 +++++++++------------------- sled-agent/src/services.rs | 5 +---- 3 files changed, 10 insertions(+), 24 deletions(-) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index e305d84885d..fc408667e08 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -1003,7 +1003,6 @@ mod tests { test_setup_log("persistent_sled_agent_request_serialization"); let log = &logctx.log; - // No secret generated if we have <= 1 sled let secret = RackSecret::new(); let (mut shares, verifier) = secret.split(2, 4).unwrap(); diff --git a/sled-agent/src/ledger.rs b/sled-agent/src/ledger.rs index b9bb39d2bb7..6826c5c21dc 100644 --- a/sled-agent/src/ledger.rs +++ b/sled-agent/src/ledger.rs @@ -228,7 +228,7 @@ mod test { let log = &logctx.log; let config_dir = camino_tempfile::Utf8TempDir::new().unwrap(); - let ledger = Ledger::::new_with( + let ledger = Ledger::new_with( &log, vec![config_dir.path().to_path_buf()], Data::default(), @@ -255,11 +255,8 @@ mod test { let config_path = 
config_dir.path().join("ledger.toml"); // Create the ledger within a configuration directory - let mut ledger = Ledger::::new_with( - &log, - vec![config_path.clone()], - Data::default(), - ); + let mut ledger = + Ledger::new_with(&log, vec![config_path.clone()], Data::default()); ledger.data_mut().contents = "new contents".to_string(); ledger.commit().await.expect("Failed to write ledger"); assert!(config_path.exists()); @@ -291,9 +288,8 @@ mod test { .map(|d| d.path().join("ledger.toml")) .collect::>(); - let mut ledger = Ledger::::new(&log, config_paths.clone()) - .await - .expect("Failed to read ledger"); + let mut ledger = + Ledger::new_with(&log, config_paths.clone(), Data::default()); ledger.data_mut().contents = "new contents".to_string(); ledger.commit().await.expect("Failed to write ledger"); @@ -337,11 +333,8 @@ mod test { .map(|d| d.path().join("ledger.toml")) .collect::>(); - let mut ledger = Ledger::::new_with( - &log, - config_paths.clone(), - Data::default(), - ); + let mut ledger = + Ledger::new_with(&log, config_paths.clone(), Data::default()); ledger.data_mut().contents = "written to both configs".to_string(); ledger.commit().await.expect("Failed to write ledger"); @@ -387,11 +380,8 @@ mod test { let ledger = Ledger::::new(&log, config_paths.clone()).await; assert!(ledger.is_none()); - let mut ledger = Ledger::::new_with( - &log, - config_paths.clone(), - Data::default(), - ); + let mut ledger = + Ledger::new_with(&log, config_paths.clone(), Data::default()); assert_eq!(ledger.data(), &Data::default()); let err = ledger.commit().await.unwrap_err(); assert!( diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index a8538dbf889..8f90b710f86 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -94,12 +94,9 @@ use uuid::Uuid; #[derive(thiserror::Error, Debug)] pub enum Error { - #[error("Cannot serialize TOML to file {path}: {err}")] + #[error("Cannot serialize TOML to file: {path}: {err}")] TomlSerialize { 
path: Utf8PathBuf, err: toml::ser::Error }, - #[error("Cannot deserialize TOML from file {path}: {err}")] - TomlDeserialize { path: Utf8PathBuf, err: toml::de::Error }, - #[error("Failed to perform I/O: {message}: {err}")] Io { message: String, From e3fe7e2035ce987da4661b80f8f017dc22d60c68 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 May 2023 17:17:35 -0400 Subject: [PATCH 23/39] Remove /var/oxide from buildomat --- .github/buildomat/jobs/deploy.sh | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index cd484377c6f..dcaa101d162 100644 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -111,13 +111,6 @@ fi # pfexec /sbin/zfs create -o mountpoint=/zone rpool/zone -# -# The sled agent will ostensibly write things into /var/oxide, so make that a -# tmpfs as well: -# -pfexec mkdir -p /var/oxide -pfexec mount -F tmpfs -O swap /var/oxide - pfexec mkdir /opt/oxide/work pfexec chown build:build /opt/oxide/work cd /opt/oxide/work @@ -148,10 +141,7 @@ pfexec curl -sSfL -o /var/svc/manifest/site/tcpproxy.xml \ pfexec svccfg import /var/svc/manifest/site/tcpproxy.xml # -# This OMICRON_NO_UNINSTALL hack here is so that there is no implicit uninstall -# before the install. This doesn't work right now because, above, we made -# /var/oxide a file system so you can't remove it (EBUSY) like a regular -# directory. The lab-netdev target is a ramdisk system that is always cleared +# The lab-netdev target is a ramdisk system that is always cleared # out between runs, so it has not had any state yet that requires # uninstallation. 
# From 92acdac0e18308b8b48e185f3b73250c2c73dcc7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 13:16:21 -0400 Subject: [PATCH 24/39] Ensure bootstrap agent sees all disks, await boot disk --- sled-agent/src/bootstrap/agent.rs | 18 ++++++++++++++++++ sled-agent/src/bootstrap/hardware.rs | 4 ++++ sled-agent/src/storage_manager.rs | 19 +++++++++++++++++++ sled-hardware/src/disk.rs | 4 ++++ 4 files changed, 45 insertions(+) diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index fc408667e08..e20601fbf03 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -392,6 +392,24 @@ impl Agent { global_zone_bootstrap_link_local_address, }; + // Wait for at least the M.2 we booted from to show up. + // + // This gives the bootstrap agent a chance to read locally-stored + // configs if any exist. + loop { + match agent.storage_resources.boot_disk().await { + Some(disk) => { + info!(agent.log, "Found boot disk M.2: {disk:?}"); + break; + } + None => { + info!(agent.log, "Waiting for boot disk M.2..."); + tokio::time::sleep(core::time::Duration::from_millis(250)) + .await; + } + } + } + let paths = sled_config_paths(&agent.storage_resources).await; let trust_quorum = if let Some(ledger) = Ledger::::new(&agent.log, paths).await diff --git a/sled-agent/src/bootstrap/hardware.rs b/sled-agent/src/bootstrap/hardware.rs index 6c178642d2f..a8538d15e4e 100644 --- a/sled-agent/src/bootstrap/hardware.rs +++ b/sled-agent/src/bootstrap/hardware.rs @@ -123,6 +123,10 @@ impl HardwareMonitorWorker { warn!(self.log, "Failed to deactivate switch: {e}"); } } + + self.storage + .ensure_using_exactly_these_disks(self.hardware.disks()) + .await; } } diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 73ddd75cdd4..e1b767599b3 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -220,6 +220,24 @@ pub struct StorageResources { } impl 
StorageResources { + /// Returns the identity of the boot disk. + /// + /// If this returns `None`, we have not processed the boot disk yet. + pub async fn boot_disk(&self) -> Option { + let disks = self.disks.lock().await; + disks.iter().find_map(|(id, disk)| { + match disk { + DiskWrapper::Real { disk, .. } => { + if disk.is_boot_disk() { + return Some(id.clone()); + } + } + _ => (), + }; + None + }) + } + /// Returns all M.2 zpools pub async fn all_m2_zpools(&self) -> Vec { self.all_zpools(DiskVariant::M2).await @@ -234,6 +252,7 @@ impl StorageResources { .collect() } + /// Returns all zpools of a particular variant pub async fn all_zpools(&self, variant: DiskVariant) -> Vec { let disks = self.disks.lock().await; disks diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index 336547ceec2..ef97b9d2ab3 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -171,6 +171,10 @@ impl UnparsedDisk { pub fn identity(&self) -> &DiskIdentity { &self.identity } + + pub fn is_boot_disk(&self) -> bool { + self.is_boot_disk + } } /// A physical disk conforming to the expected partition layout. From ef495159b214c9c3fa664f8283925608f2b871c2 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 13:26:10 -0400 Subject: [PATCH 25/39] synthetic --- sled-agent/src/storage_manager.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index e1b767599b3..852af3b054c 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -227,12 +227,20 @@ impl StorageResources { let disks = self.disks.lock().await; disks.iter().find_map(|(id, disk)| { match disk { + // This is the "real" use-case: if we have real disks, query + // their properties to identify if they truly are the boot disk. DiskWrapper::Real { disk, .. 
} => { if disk.is_boot_disk() { return Some(id.clone()); } } - _ => (), + // This is the "less real" use-case: if we have synthetic disks, + // just label the first M.2-looking one as a "boot disk". + DiskWrapper::Synthetic { .. } => { + if matches!(disk.variant(), DiskVariant::M2) { + return Some(disk.identity()); + } + } }; None }) From 9692493cff84a08833f75b19f63f43115c366237 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 15:37:11 -0400 Subject: [PATCH 26/39] wip --- illumos-utils/src/running_zone.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 9676e5a48f2..ee0efe34880 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -616,6 +616,7 @@ impl InstalledZone { &self.zonepath } + // TODO: This would benefit from a "builder-pattern" interface. #[allow(clippy::too_many_arguments)] pub async fn install( log: &Logger, @@ -639,6 +640,10 @@ impl InstalledZone { } })?; + // TODO: This has gotta change. + // TODO: Reach into M.2 for the correct pathway. + // + // TODO: Could *also* look in /opt/oxide? just like a "paths" argument? 
let full_zone_name = Self::get_zone_name(zone_type, unique_name); let zone_image_path = Utf8PathBuf::from(format!("/opt/oxide/{}.tar.gz", zone_type)); From 4b6b5840279ce6708b895ba17bf3c20b245e7f81 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 17:05:30 -0400 Subject: [PATCH 27/39] Inject synthetic zpools in bootstrap agent --- sled-agent/src/bootstrap/hardware.rs | 12 ++++++++++++ sled-agent/src/sled_agent.rs | 10 ---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sled-agent/src/bootstrap/hardware.rs b/sled-agent/src/bootstrap/hardware.rs index a8538d15e4e..16d35ee812b 100644 --- a/sled-agent/src/bootstrap/hardware.rs +++ b/sled-agent/src/bootstrap/hardware.rs @@ -191,6 +191,18 @@ impl HardwareMonitor { let storage_manager = StorageManager::new(&log).await; + // If our configuration asks for synthetic zpools, insert them now. + if let Some(pools) = &sled_config.zpools { + for pool in pools { + info!( + log, + "Upserting synthetic zpool to Storage Manager: {}", + pool.to_string() + ); + storage_manager.upsert_synthetic_disk(pool.clone()).await; + } + } + let service_manager = ServiceManager::new( log.clone(), global_zone_bootstrap_link_local_address, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index e19d64ae69e..a9f8f5b9238 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -238,16 +238,6 @@ impl SledAgent { sled_id: request.id, }) .await?; - if let Some(pools) = &config.zpools { - for pool in pools { - info!( - log, - "Sled Agent upserting zpool to Storage Manager: {}", - pool.to_string() - ); - storage.upsert_synthetic_disk(pool.clone()).await; - } - } let instances = InstanceManager::new( parent_log.clone(), From e31ca8876ef997d2515b239e1d1bafdebd2a9b66 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 17:10:34 -0400 Subject: [PATCH 28/39] Update path in comment --- sled-agent/src/rack_setup/service.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 1b6c0b8d98f..bb28ce22837 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -16,9 +16,9 @@ //! Rack setup occurs in distinct phases which are denoted by the prescence of //! configuration files. //! -//! - /var/oxide/rss-sled-plan.toml (Sled Plan) -//! - /var/oxide/rss-service-plan.toml (Service Plan) -//! - /var/oxide/rss-plan-completed.marker (Plan Execution Complete) +//! - /pool/int//config/rss-sled-plan.toml (Sled Plan) +//! - /pool/int//config/rss-service-plan.toml (Service Plan) +//! - /pool/int//config/rss-plan-completed.marker (Plan Execution Complete) //! //! ## Sled Plan //! From 164b5fdbdd0ea03806f0c315403147f4a4e88df3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 17:17:14 -0400 Subject: [PATCH 29/39] fix documentation --- sled-agent/src/ledger.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sled-agent/src/ledger.rs b/sled-agent/src/ledger.rs index 6826c5c21dc..519b1179777 100644 --- a/sled-agent/src/ledger.rs +++ b/sled-agent/src/ledger.rs @@ -69,9 +69,8 @@ impl Ledger { /// Reads the ledger from any of the provided `paths`. /// - /// Returns the following, in order: - /// - The ledger with the highest generation number - /// - If none exists, returns a default ledger + /// Returns the ledger with the highest generation number if it + /// exists, otherwise returns `None`. 
pub async fn new(log: &Logger, paths: Vec) -> Option { // Read the ledgers from storage if let Some(ledger) = Self::read(log, &paths).await { From d6ad50bc07a9c15b30c1cae6012229efcc393365 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 17:26:04 -0400 Subject: [PATCH 30/39] rustdoc complaints --- sled-agent/src/rack_setup/service.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index bb28ce22837..c8efcd11e82 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -16,9 +16,9 @@ //! Rack setup occurs in distinct phases which are denoted by the prescence of //! configuration files. //! -//! - /pool/int//config/rss-sled-plan.toml (Sled Plan) -//! - /pool/int//config/rss-service-plan.toml (Service Plan) -//! - /pool/int//config/rss-plan-completed.marker (Plan Execution Complete) +//! - /pool/int/UUID/config/rss-sled-plan.toml (Sled Plan) +//! - /pool/int/UUID/config/rss-service-plan.toml (Service Plan) +//! - /pool/int/UUID/config/rss-plan-completed.marker (Plan Execution Complete) //! //! ## Sled Plan //! 
From b372f4de81bc9a37cfb8c2339aaa5460e093fe14 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 20:56:08 -0400 Subject: [PATCH 31/39] [sled-agent] Look for zone images in the install dataset too --- illumos-utils/src/running_zone.rs | 30 ++++++++++++++++++++++++------ sled-agent/src/instance.rs | 1 + sled-agent/src/services.rs | 14 ++++++++++++++ sled-agent/src/storage_manager.rs | 6 +++--- 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 363aa2b90e7..3ce76113310 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -564,6 +564,9 @@ pub enum InstallZoneError { #[source] err: crate::zone::AdmError, }, + + #[error("Failed to find zone image '{image}' from `{paths:?}'")] + ImageNotFound { image: String, paths: Vec }, } pub struct InstalledZone { @@ -626,6 +629,7 @@ impl InstalledZone { log: &Logger, underlay_vnic_allocator: &VnicAllocator, zone_root_path: &Utf8Path, + zone_image_paths: &[Utf8PathBuf], zone_type: &str, unique_name: Option<&str>, datasets: &[zone::Dataset], @@ -644,13 +648,27 @@ impl InstalledZone { } })?; - // TODO: This has gotta change. - // TODO: Reach into M.2 for the correct pathway. - // - // TODO: Could *also* look in /opt/oxide? just like a "paths" argument? let full_zone_name = Self::get_zone_name(zone_type, unique_name); - let zone_image_path = - Utf8PathBuf::from(format!("/opt/oxide/{}.tar.gz", zone_type)); + + // Looks for the image within `zone_image_path`, in order. 
+ let image = format!("{}.tar.gz", zone_type); + let zone_image_path = zone_image_paths + .iter() + .find_map(|image_path| { + let path = image_path.join(&image); + if path.exists() { + Some(path) + } else { + None + } + }) + .ok_or_else(|| InstallZoneError::ImageNotFound { + image: image.to_string(), + paths: zone_image_paths + .iter() + .map(|p| p.to_path_buf()) + .collect(), + })?; let net_device_names: Vec = opte_ports .iter() diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 5e7e6ef7f7d..b42641901a6 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -772,6 +772,7 @@ impl Instance { &inner.log, &inner.vnic_allocator, &root, + &["/opt/oxide".into()], "propolis-server", Some(&inner.propolis_id().to_string()), // dataset= diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 8f90b710f86..c70cbd38997 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -930,10 +930,24 @@ impl ServiceManager { .map(|d| zone::Device { name: d.to_string() }) .collect(); + // Look for the image in the ramdisk first + let mut zone_image_paths = vec!["/opt/oxide".into()]; + // If the boot disk exists, look for the image in the "install" dataset + // there too. + if let Some((_, boot_zpool)) = + self.inner.storage.resources().boot_disk().await + { + zone_image_paths.push( + boot_zpool + .dataset_mountpoint(sled_hardware::disk::INSTALL_DATASET), + ); + } + let installed_zone = InstalledZone::install( &self.inner.log, &self.inner.underlay_vnic_allocator, &request.root, + zone_image_paths.as_slice(), &request.zone.zone_type.to_string(), unique_name.as_deref(), datasets.as_slice(), diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 852af3b054c..ce7d3cbdc0a 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -223,7 +223,7 @@ impl StorageResources { /// Returns the identity of the boot disk. 
/// /// If this returns `None`, we have not processed the boot disk yet. - pub async fn boot_disk(&self) -> Option { + pub async fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { let disks = self.disks.lock().await; disks.iter().find_map(|(id, disk)| { match disk { @@ -231,14 +231,14 @@ impl StorageResources { // their properties to identify if they truly are the boot disk. DiskWrapper::Real { disk, .. } => { if disk.is_boot_disk() { - return Some(id.clone()); + return Some((id.clone(), disk.zpool_name().clone())); } } // This is the "less real" use-case: if we have synthetic disks, // just label the first M.2-looking one as a "boot disk". DiskWrapper::Synthetic { .. } => { if matches!(disk.variant(), DiskVariant::M2) { - return Some(disk.identity()); + return Some((id.clone(), disk.zpool_name().clone())); } } }; From 529a0357bcda9e073b1e1da25a95a719340d9eb1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 22:24:19 -0400 Subject: [PATCH 32/39] moving dns to be a dataset-compatible filesystem --- common/src/sql/dbinit.sql | 4 +- nexus/db-model/src/dataset_kind.rs | 8 +++ nexus/types/src/internal_api/params.rs | 21 ++---- openapi/nexus-internal.json | 4 +- openapi/sled-agent.json | 55 ++++++++++++++ sled-agent/src/http_entrypoints.rs | 1 + sled-agent/src/params.rs | 43 ++++++++++- sled-agent/src/rack_setup/plan/service.rs | 88 ++++++++++------------- sled-agent/src/sled_agent.rs | 7 +- smf/external-dns/config.toml | 2 +- smf/internal-dns/config.toml | 2 +- 11 files changed, 159 insertions(+), 76 deletions(-) diff --git a/common/src/sql/dbinit.sql b/common/src/sql/dbinit.sql index 957f2d3da78..20db3b9f036 100644 --- a/common/src/sql/dbinit.sql +++ b/common/src/sql/dbinit.sql @@ -364,7 +364,9 @@ CREATE TABLE omicron.public.Zpool ( CREATE TYPE omicron.public.dataset_kind AS ENUM ( 'crucible', 'cockroach', - 'clickhouse' + 'clickhouse', + 'external_dns', + 'internal_dns' ); /* diff --git a/nexus/db-model/src/dataset_kind.rs 
b/nexus/db-model/src/dataset_kind.rs index e2c0510ab3d..f4c6a5eee66 100644 --- a/nexus/db-model/src/dataset_kind.rs +++ b/nexus/db-model/src/dataset_kind.rs @@ -19,6 +19,8 @@ impl_enum_type!( Crucible => b"crucible" Cockroach => b"cockroach" Clickhouse => b"clickhouse" + ExternalDns => b"external_dns" + InternalDns => b"internal_dns" ); impl From for DatasetKind { @@ -33,6 +35,12 @@ impl From for DatasetKind { internal_api::params::DatasetKind::Clickhouse => { DatasetKind::Clickhouse } + internal_api::params::DatasetKind::ExternalDns => { + DatasetKind::ExternalDns + } + internal_api::params::DatasetKind::InternalDns => { + DatasetKind::InternalDns + } } } } diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index ce86de17131..7a3963229ce 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -14,7 +14,6 @@ use std::fmt; use std::net::IpAddr; use std::net::SocketAddr; use std::net::SocketAddrV6; -use std::str::FromStr; use uuid::Uuid; /// Describes the role of the sled within the rack. @@ -116,6 +115,8 @@ pub enum DatasetKind { Crucible, Cockroach, Clickhouse, + ExternalDns, + InternalDns, } impl fmt::Display for DatasetKind { @@ -125,27 +126,13 @@ impl fmt::Display for DatasetKind { Crucible => "crucible", Cockroach => "cockroach", Clickhouse => "clickhouse", + ExternalDns => "external_dns", + InternalDns => "internal_dns", }; write!(f, "{}", s) } } -impl FromStr for DatasetKind { - type Err = omicron_common::api::external::Error; - - fn from_str(s: &str) -> Result { - use DatasetKind::*; - match s { - "crucible" => Ok(Crucible), - "cockroach" => Ok(Cockroach), - "clickhouse" => Ok(Clickhouse), - _ => Err(Self::Err::InternalError { - internal_message: format!("Unknown dataset kind: {}", s), - }), - } - } -} - /// Describes a dataset within a pool. 
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] pub struct DatasetPutRequest { diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 79d54135cf0..966a78dc5eb 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -885,7 +885,9 @@ "enum": [ "crucible", "cockroach", - "clickhouse" + "clickhouse", + "external_dns", + "internal_dns" ] }, "DatasetPutRequest": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 25d491adf84..6f192e438d4 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -678,6 +678,61 @@ "required": [ "type" ] + }, + { + "type": "object", + "properties": { + "dns_address": { + "description": "The address at which the external DNS server is reachable.", + "type": "string" + }, + "http_address": { + "description": "The address at which the external DNS server API is reachable.", + "type": "string" + }, + "nic": { + "description": "The service vNIC providing external connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/components/schemas/NetworkInterface" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "external_dns" + ] + } + }, + "required": [ + "dns_address", + "http_address", + "nic", + "type" + ] + }, + { + "type": "object", + "properties": { + "dns_address": { + "type": "string" + }, + "http_address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "internal_dns" + ] + } + }, + "required": [ + "dns_address", + "http_address", + "type" + ] } ] }, diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 4a0db86ed59..3c82fbaf590 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -107,6 +107,7 @@ async fn filesystem_put( body_args.zpool_id, body_args.dataset_kind, body_args.address, + body_args.gz_address, ) .await .map_err(|e| Error::from(e))?; diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 89b5698d96a..dc17655324f 100644 --- 
a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -216,6 +216,18 @@ pub enum DatasetKind { CockroachDb, Crucible, Clickhouse, + ExternalDns { + /// The address at which the external DNS server API is reachable. + http_address: SocketAddrV6, + /// The address at which the external DNS server is reachable. + dns_address: SocketAddr, + /// The service vNIC providing external connectivity using OPTE. + nic: NetworkInterface, + }, + InternalDns { + http_address: SocketAddrV6, + dns_address: SocketAddrV6, + }, } impl DatasetKind { @@ -225,6 +237,8 @@ impl DatasetKind { DatasetKind::CockroachDb => ZoneType::CockroachDb, DatasetKind::Crucible => ZoneType::Crucible, DatasetKind::Clickhouse => ZoneType::Clickhouse, + DatasetKind::ExternalDns { .. } => ZoneType::ExternalDns, + DatasetKind::InternalDns { .. } => ZoneType::InternalDns, } } @@ -234,10 +248,16 @@ impl DatasetKind { /// service in their zone. If that precondition is no longer true, this /// interface should be re-visited. pub fn service_type(&self) -> ServiceType { - match *self { + match self.clone() { DatasetKind::CockroachDb => ServiceType::CockroachDb, DatasetKind::Crucible => ServiceType::Crucible, DatasetKind::Clickhouse => ServiceType::Clickhouse, + DatasetKind::ExternalDns { http_address, dns_address, nic } => { + ServiceType::ExternalDns { http_address, dns_address, nic } + } + DatasetKind::InternalDns { http_address, dns_address } => { + ServiceType::InternalDns { http_address, dns_address } + } } } } @@ -249,6 +269,17 @@ impl From for sled_agent_client::types::DatasetKind { CockroachDb => Self::CockroachDb, Crucible => Self::Crucible, Clickhouse => Self::Clickhouse, + ExternalDns { http_address, dns_address, nic } => { + Self::ExternalDns { + http_address: http_address.to_string(), + dns_address: dns_address.to_string(), + nic: nic.into(), + } + } + InternalDns { http_address, dns_address } => Self::InternalDns { + http_address: http_address.to_string(), + dns_address: 
dns_address.to_string(), + }, } } } @@ -260,6 +291,8 @@ impl From for nexus_client::types::DatasetKind { CockroachDb { .. } => Self::Cockroach, Crucible => Self::Crucible, Clickhouse => Self::Clickhouse, + ExternalDns { .. } => Self::ExternalDns, + InternalDns { .. } => Self::InternalDns, } } } @@ -271,6 +304,8 @@ impl std::fmt::Display for DatasetKind { Crucible => "crucible", CockroachDb { .. } => "cockroachdb", Clickhouse => "clickhouse", + ExternalDns { .. } => "external_dns", + InternalDns { .. } => "internal_dns", }; write!(f, "{}", s) } @@ -289,7 +324,11 @@ pub struct DatasetEnsureBody { // The type of the filesystem. pub dataset_kind: DatasetKind, // The address on which the zone will listen for requests. - pub address: SocketAddrV6, + pub address: Ipv6Addr, + // The addresses in the global zone which should be created, if necessary + // to route to the service. + #[serde(default)] + pub gz_address: Option, } impl From for sled_agent_client::types::DatasetEnsureBody { diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 2d1f7a01f3f..6a0ae0b58da 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -296,26 +296,22 @@ impl Plan { .unwrap(); let (nic, external_ip) = svc_port_builder.next_dns(id, &mut services_ip_pool)?; - request.services.push(ServiceZoneRequest { + request.datasets.push(DatasetEnsureBody { id, - zone_type: ZoneType::ExternalDns, - addresses: vec![internal_ip], - dataset: None, - gz_addresses: vec![], - services: vec![ServiceZoneService { - id, - details: ServiceType::ExternalDns { - http_address: SocketAddrV6::new( - internal_ip, - http_port, - 0, - 0, - ), - dns_address: SocketAddr::new(external_ip, dns_port), - nic, - }, - }], - }) + zpool_id: u2_zpools[0], + dataset_kind: crate::params::DatasetKind::ExternalDns { + http_address: SocketAddrV6::new( + internal_ip, + http_port, + 0, + 0, + ), + dns_address: SocketAddr::new(external_ip, 
dns_port), + nic, + }, + address: internal_ip, + gz_address: None, + }); } // The first enumerated sleds get assigned the responsibility @@ -385,12 +381,12 @@ impl Plan { dns_builder .service_backend_zone(ServiceName::Cockroach, &zone, port) .unwrap(); - let address = SocketAddrV6::new(address, port, 0, 0); request.datasets.push(DatasetEnsureBody { id, zpool_id: u2_zpools[0], dataset_kind: crate::params::DatasetKind::CockroachDb, address, + gz_address: None, }); } @@ -403,40 +399,36 @@ impl Plan { dns_builder .service_backend_zone(ServiceName::Clickhouse, &zone, port) .unwrap(); - let address = SocketAddrV6::new(address, port, 0, 0); request.datasets.push(DatasetEnsureBody { id, zpool_id: u2_zpools[0], dataset_kind: crate::params::DatasetKind::Clickhouse, address, + gz_address: None, }); } // Each zpool gets a crucible zone. // // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove - for zpool_id in u2_zpools { - let address = SocketAddrV6::new( - addr_alloc.next().expect("Not enough addrs"), - omicron_common::address::CRUCIBLE_PORT, - 0, - 0, - ); + for zpool_id in &u2_zpools { + let address = addr_alloc.next().expect("Not enough addrs"); let id = Uuid::new_v4(); - let zone = dns_builder.host_zone(id, *address.ip()).unwrap(); + let zone = dns_builder.host_zone(id, address).unwrap(); dns_builder .service_backend_zone( ServiceName::Crucible(id), &zone, - address.port(), + omicron_common::address::CRUCIBLE_PORT, ) .unwrap(); request.datasets.push(DatasetEnsureBody { id, - zpool_id, + zpool_id: *zpool_id, dataset_kind: crate::params::DatasetKind::Crucible, address, + gz_address: None, }); } @@ -454,26 +446,22 @@ impl Plan { DNS_HTTP_PORT, ) .unwrap(); - request.services.push(ServiceZoneRequest { + request.datasets.push(DatasetEnsureBody { id, - zone_type: ZoneType::InternalDns, - addresses: vec![dns_addr], - dataset: None, - gz_addresses: vec![dns_subnet.gz_address().ip()], - services: vec![ServiceZoneService { - id, - details: ServiceType::InternalDns { 
- http_address: SocketAddrV6::new( - dns_addr, - DNS_HTTP_PORT, - 0, - 0, - ), - dns_address: SocketAddrV6::new( - dns_addr, DNS_PORT, 0, 0, - ), - }, - }], + zpool_id: u2_zpools[0], + dataset_kind: crate::params::DatasetKind::InternalDns { + http_address: SocketAddrV6::new( + dns_addr, + DNS_HTTP_PORT, + 0, + 0, + ), + dns_address: SocketAddrV6::new( + dns_addr, DNS_PORT, 0, 0, + ), + }, + address: dns_addr, + gz_address: Some(dns_subnet.gz_address().ip()), }); } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index a9f8f5b9238..1a8851772ef 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -516,7 +516,8 @@ impl SledAgent { dataset_id: Uuid, zpool_id: Uuid, dataset_kind: DatasetKind, - address: SocketAddrV6, + address: Ipv6Addr, + gz_address: Option, ) -> Result<(), Error> { // First, ensure the dataset exists let dataset = self @@ -541,9 +542,9 @@ impl SledAgent { let request = crate::params::ServiceZoneRequest { id: dataset_id, zone_type: dataset_kind.zone_type(), - addresses: vec![*address.ip()], + addresses: vec![address], dataset: Some(dataset), - gz_addresses: vec![], + gz_addresses: gz_address.into_iter().collect(), services, }; self.inner.services.ensure_storage_service(request).await?; diff --git a/smf/external-dns/config.toml b/smf/external-dns/config.toml index 4f572ddbc1b..b90224f48c2 100644 --- a/smf/external-dns/config.toml +++ b/smf/external-dns/config.toml @@ -11,5 +11,5 @@ path = "/dev/stdout" if_exists = "append" [storage] -storage_path = "/var/oxide/dns" +storage_path = "/data" keep_old_generations = 3 diff --git a/smf/internal-dns/config.toml b/smf/internal-dns/config.toml index 4f572ddbc1b..b90224f48c2 100644 --- a/smf/internal-dns/config.toml +++ b/smf/internal-dns/config.toml @@ -11,5 +11,5 @@ path = "/dev/stdout" if_exists = "append" [storage] -storage_path = "/var/oxide/dns" +storage_path = "/data" keep_old_generations = 3 From e1c378cd6ff3abc03655be75a791328585109ac6 Mon Sep 17 
00:00:00 2001 From: Sean Klein Date: Thu, 4 May 2023 23:38:46 -0400 Subject: [PATCH 33/39] Socketaddrv6 usage --- openapi/sled-agent.json | 6 +++ sled-agent/src/params.rs | 3 +- sled-agent/src/rack_setup/plan/service.rs | 36 ++++++++------ sled-agent/src/rack_setup/service.rs | 60 +++++++---------------- sled-agent/src/sled_agent.rs | 4 +- 5 files changed, 49 insertions(+), 60 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 6f192e438d4..b836658cdf5 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -618,6 +618,12 @@ "dataset_kind": { "$ref": "#/components/schemas/DatasetKind" }, + "gz_address": { + "nullable": true, + "default": null, + "type": "string", + "format": "ipv6" + }, "id": { "type": "string", "format": "uuid" diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index dc17655324f..972e95e58bf 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -324,7 +324,7 @@ pub struct DatasetEnsureBody { // The type of the filesystem. pub dataset_kind: DatasetKind, // The address on which the zone will listen for requests. - pub address: Ipv6Addr, + pub address: SocketAddrV6, // The addresses in the global zone which should be created, if necessary // to route to the service. 
#[serde(default)] @@ -337,6 +337,7 @@ impl From for sled_agent_client::types::DatasetEnsureBody { zpool_id: p.zpool_id, dataset_kind: p.dataset_kind.into(), address: p.address.to_string(), + gz_address: p.gz_address, id: p.id, } } diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 6a0ae0b58da..ff57a944a2c 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -284,6 +284,8 @@ impl Plan { if idx < EXTERNAL_DNS_COUNT { let internal_ip = addr_alloc.next().expect("Not enough addrs"); let http_port = omicron_common::address::DNS_HTTP_PORT; + let http_address = + SocketAddrV6::new(internal_ip, http_port, 0, 0); let dns_port = omicron_common::address::DNS_PORT; let id = Uuid::new_v4(); let zone = dns_builder.host_zone(id, internal_ip).unwrap(); @@ -309,7 +311,7 @@ impl Plan { dns_address: SocketAddr::new(external_ip, dns_port), nic, }, - address: internal_ip, + address: http_address, gz_address: None, }); } @@ -375,9 +377,10 @@ impl Plan { // zpools described from the underlying config file. 
if idx < CRDB_COUNT { let id = Uuid::new_v4(); - let address = addr_alloc.next().expect("Not enough addrs"); + let ip = addr_alloc.next().expect("Not enough addrs"); let port = omicron_common::address::COCKROACH_PORT; - let zone = dns_builder.host_zone(id, address).unwrap(); + let address = SocketAddrV6::new(ip, port, 0, 0); + let zone = dns_builder.host_zone(id, ip).unwrap(); dns_builder .service_backend_zone(ServiceName::Cockroach, &zone, port) .unwrap(); @@ -393,9 +396,10 @@ impl Plan { // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove if idx < CLICKHOUSE_COUNT { let id = Uuid::new_v4(); - let address = addr_alloc.next().expect("Not enough addrs"); + let ip = addr_alloc.next().expect("Not enough addrs"); let port = omicron_common::address::CLICKHOUSE_PORT; - let zone = dns_builder.host_zone(id, address).unwrap(); + let address = SocketAddrV6::new(ip, port, 0, 0); + let zone = dns_builder.host_zone(id, ip).unwrap(); dns_builder .service_backend_zone(ServiceName::Clickhouse, &zone, port) .unwrap(); @@ -412,14 +416,16 @@ impl Plan { // // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove for zpool_id in &u2_zpools { - let address = addr_alloc.next().expect("Not enough addrs"); + let ip = addr_alloc.next().expect("Not enough addrs"); + let port = omicron_common::address::CRUCIBLE_PORT; + let address = SocketAddrV6::new(ip, port, 0, 0); let id = Uuid::new_v4(); - let zone = dns_builder.host_zone(id, address).unwrap(); + let zone = dns_builder.host_zone(id, ip).unwrap(); dns_builder .service_backend_zone( ServiceName::Crucible(id), &zone, - omicron_common::address::CRUCIBLE_PORT, + port, ) .unwrap(); @@ -436,9 +442,11 @@ impl Plan { // responsibility of being internal DNS servers. 
if idx < dns_subnets.len() { let dns_subnet = &dns_subnets[idx]; - let dns_addr = dns_subnet.dns_address().ip(); + let dns_ip = dns_subnet.dns_address().ip(); + let http_address = + SocketAddrV6::new(dns_ip, DNS_HTTP_PORT, 0, 0); let id = Uuid::new_v4(); - let zone = dns_builder.host_zone(id, dns_addr).unwrap(); + let zone = dns_builder.host_zone(id, dns_ip).unwrap(); dns_builder .service_backend_zone( ServiceName::InternalDns, @@ -451,16 +459,14 @@ impl Plan { zpool_id: u2_zpools[0], dataset_kind: crate::params::DatasetKind::InternalDns { http_address: SocketAddrV6::new( - dns_addr, + dns_ip, DNS_HTTP_PORT, 0, 0, ), - dns_address: SocketAddrV6::new( - dns_addr, DNS_PORT, 0, 0, - ), + dns_address: SocketAddrV6::new(dns_ip, DNS_PORT, 0, 0), }, - address: dns_addr, + address: http_address, gz_address: Some(dns_subnet.gz_address().ip()), }); } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index c8efcd11e82..f2e6666bc3f 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -62,7 +62,7 @@ use crate::bootstrap::rss_handle::BootstrapAgentHandle; use crate::ledger::{Ledger, Ledgerable}; use crate::nexus::d2n_params; use crate::params::{ - AutonomousServiceOnlyError, DatasetEnsureBody, ServiceType, + AutonomousServiceOnlyError, DatasetEnsureBody, DatasetKind, ServiceType, ServiceZoneRequest, TimeSync, ZoneType, }; use crate::rack_setup::plan::service::{ @@ -343,21 +343,23 @@ impl ServiceInner { // Start up the internal DNS services futures::future::join_all(service_plan.services.iter().map( |(sled_address, services_request)| async move { - let services: Vec<_> = services_request - .services + let datasets: Vec<_> = services_request + .datasets .iter() - .filter_map(|svc| { - if matches!(svc.zone_type, ZoneType::InternalDns) { - Some(svc.clone()) + .filter_map(|dataset| { + if matches!( + dataset.dataset_kind, + DatasetKind::InternalDns { .. 
} + ) { + Some(dataset.clone()) } else { None } }) .collect(); - if !services.is_empty() { - self.initialize_services(*sled_address, &services).await?; + if !datasets.is_empty() { + self.initialize_datasets(*sled_address, &datasets).await?; } - Ok(()) }, )) @@ -372,39 +374,13 @@ impl ServiceInner { service_plan.services.iter().filter_map( |(_, services_request)| { // iterate services for this sled - let dns_addrs: Vec<_> = services_request - .services + let dns_addrs: Vec = services_request + .datasets .iter() - .filter_map(|svc| { - if !matches!(svc.zone_type, ZoneType::InternalDns) { - // This is not an internal DNS zone. - None - } else { - // This is an internal DNS zone. Find the IP - // and port that have been assigned to it. - // There should be exactly one. - let addrs = svc.services.iter().filter_map(|s| { - if let ServiceType::InternalDns { http_address, .. } = &s.details { - Some(*http_address) - } else { - None - } - }).collect::>(); - - if addrs.len() == 1 { - Some(addrs[0]) - } else { - warn!( - log, - "DNS configuration: expected one \ - InternalDns service for zone with \ - type ZoneType::InternalDns, but \ - found {} (zone {})", - addrs.len(), - svc.id, - ); - None - } + .filter_map(|dataset| { + match dataset.dataset_kind { + DatasetKind::InternalDns { http_address, .. 
} => Some(http_address), + _ => None, } }) .collect(); @@ -416,7 +392,7 @@ impl ServiceInner { } ) .flatten() - .collect::>(); + .collect::>(); let dns_config = &service_plan.dns_config; for ip_addr in dns_server_ips { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 1a8851772ef..0b337d14894 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -516,7 +516,7 @@ impl SledAgent { dataset_id: Uuid, zpool_id: Uuid, dataset_kind: DatasetKind, - address: Ipv6Addr, + address: SocketAddrV6, gz_address: Option, ) -> Result<(), Error> { // First, ensure the dataset exists @@ -542,7 +542,7 @@ impl SledAgent { let request = crate::params::ServiceZoneRequest { id: dataset_id, zone_type: dataset_kind.zone_type(), - addresses: vec![address], + addresses: vec![*address.ip()], dataset: Some(dataset), gz_addresses: gz_address.into_iter().collect(), services, From d802924214478627269d88d82ca45ab8a7f71820 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 5 May 2023 11:15:57 -0400 Subject: [PATCH 34/39] Fix tests --- sled-agent/src/services.rs | 59 +++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index c70cbd38997..273de4afa18 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -312,6 +312,7 @@ pub struct ServiceManagerInner { // rather than simply placing them on the ramdisk. 
storage: StorageManager, ledger_directory_override: OnceCell, + image_directory_override: OnceCell, } // Late-binding information, only known once the sled agent is up and @@ -387,16 +388,22 @@ impl ServiceManager { switch_zone_bootstrap_address, storage, ledger_directory_override: OnceCell::new(), + image_directory_override: OnceCell::new(), }), }; Ok(mgr) } #[cfg(test)] - async fn override_ledger_directory(&self, path: Utf8PathBuf) { + fn override_ledger_directory(&self, path: Utf8PathBuf) { self.inner.ledger_directory_override.set(path).unwrap(); } + #[cfg(test)] + fn override_image_directory(&self, path: Utf8PathBuf) { + self.inner.image_directory_override.set(path).unwrap(); + } + pub fn switch_zone_bootstrap_address(&self) -> Ipv6Addr { self.inner.switch_zone_bootstrap_address } @@ -931,7 +938,12 @@ impl ServiceManager { .collect(); // Look for the image in the ramdisk first - let mut zone_image_paths = vec!["/opt/oxide".into()]; + let mut zone_image_paths = vec![Utf8PathBuf::from("/opt/oxide")]; + // Inject an image path if requested by a test. + if let Some(path) = self.inner.image_directory_override.get() { + zone_image_paths.push(path.clone()); + }; + // If the boot disk exists, look for the image in the "install" dataset // there too. if let Some((_, boot_zpool)) = @@ -2494,6 +2506,18 @@ mod test { sidecar_revision: "rev_whatever_its_a_test".to_string(), } } + + fn override_paths(&self, mgr: &ServiceManager) { + let dir = self.config_dir.path(); + mgr.override_ledger_directory(dir.to_path_buf()); + mgr.override_image_directory(dir.to_path_buf()); + + // We test launching "fake" versions of the zones, but the + // logic to find paths relies on checking the existence of + // files. 
+ std::fs::write(dir.join("oximeter.tar.gz"), "Not a real file") + .unwrap(); + } } #[tokio::test] @@ -2519,10 +2543,7 @@ mod test { ) .await .unwrap(); - mgr.override_ledger_directory( - test_config.config_dir.path().to_path_buf(), - ) - .await; + test_config.override_paths(&mgr); let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), @@ -2568,10 +2589,7 @@ mod test { ) .await .unwrap(); - mgr.override_ledger_directory( - test_config.config_dir.path().to_path_buf(), - ) - .await; + test_config.override_paths(&mgr); let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), @@ -2620,10 +2638,7 @@ mod test { ) .await .unwrap(); - mgr.override_ledger_directory( - test_config.config_dir.path().to_path_buf(), - ) - .await; + test_config.override_paths(&mgr); let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), @@ -2660,10 +2675,7 @@ mod test { ) .await .unwrap(); - mgr.override_ledger_directory( - test_config.config_dir.path().to_path_buf(), - ) - .await; + test_config.override_paths(&mgr); let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), @@ -2709,10 +2721,8 @@ mod test { ) .await .unwrap(); - mgr.override_ledger_directory( - test_config.config_dir.path().to_path_buf(), - ) - .await; + test_config.override_paths(&mgr); + let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), Ipv6Addr::new(0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01), @@ -2753,10 +2763,7 @@ mod test { ) .await .unwrap(); - mgr.override_ledger_directory( - test_config.config_dir.path().to_path_buf(), - ) - .await; + test_config.override_paths(&mgr); let port_manager = PortManager::new( logctx.log.new(o!("component" => "PortManager")), From ad7b72706ad4105a10b5b09ccdebf4484c9656b1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 5 May 2023 11:26:29 -0400 Subject: [PATCH 35/39] Box big errors for clippy --- 
sled-agent/src/storage_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index ce7d3cbdc0a..cb07e1a9922 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -82,7 +82,7 @@ pub enum Error { }, #[error("Dataset {name:?} exists with a different uuid (has {old}, requested {new})")] - UuidMismatch { name: DatasetName, old: Uuid, new: Uuid }, + UuidMismatch { name: Box, old: Uuid, new: Uuid }, #[error("Error parsing pool {name}'s size: {err}")] BadPoolSize { @@ -319,7 +319,7 @@ impl StorageWorker { if let Ok(id) = id_str.parse::() { if id != dataset_id { return Err(Error::UuidMismatch { - name: dataset_name.clone(), + name: Box::new(dataset_name.clone()), old: id, new: dataset_id, }); From 44adf072b7b864305eab9a8dd88a113080652b61 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Wed, 7 Jun 2023 18:32:26 +0000 Subject: [PATCH 36/39] Fix mismerge --- sled-agent/src/services.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 099fa24fb98..de2949bc8cb 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -2591,18 +2591,6 @@ mod test { } Ok(SecretState::Current(self.get_latest().await?)) } - - fn override_paths(&self, mgr: &ServiceManager) { - let dir = self.config_dir.path(); - mgr.override_ledger_directory(dir.to_path_buf()); - mgr.override_image_directory(dir.to_path_buf()); - - // We test launching "fake" versions of the zones, but the - // logic to find paths relies on checking the existence of - // files. 
- std::fs::write(dir.join("oximeter.tar.gz"), "Not a real file") - .unwrap(); - } } async fn spawn_key_manager(log: &Logger) -> StorageKeyRequester { From afef9ed56879b46ece10ad30ea6f88c47591d812 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Sat, 17 Jun 2023 22:21:09 -0700 Subject: [PATCH 37/39] Fix minor mismerge, minimize diff, add comments --- openapi/sled-agent.json | 2 ++ sled-agent/src/bootstrap/agent.rs | 1 + sled-agent/src/bootstrap/hardware.rs | 12 ------------ sled-agent/src/params.rs | 2 ++ 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 42e4dc9bc3a..f7e89787499 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -721,9 +721,11 @@ "type": "object", "properties": { "dns_address": { + "description": "The address at which the internal DNS server is reachable.", "type": "string" }, "http_address": { + "description": "The address at which the internal DNS server API is reachable.", "type": "string" }, "type": { diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 39a347eaac8..4da8ae0d121 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -485,6 +485,7 @@ impl Agent { let bootstrap_etherstub = bootstrap_etherstub()?; let switch_zone_bootstrap_address = BootstrapInterface::SwitchZone.ip(&link)?; + let hardware_monitor = HardwareMonitor::new( &log, &sled_config, diff --git a/sled-agent/src/bootstrap/hardware.rs b/sled-agent/src/bootstrap/hardware.rs index c540aa78ce1..4132edfa761 100644 --- a/sled-agent/src/bootstrap/hardware.rs +++ b/sled-agent/src/bootstrap/hardware.rs @@ -208,18 +208,6 @@ impl HardwareMonitor { } } - // If our configuration asks for synthetic zpools, insert them now. 
- if let Some(pools) = &sled_config.zpools { - for pool in pools { - info!( - log, - "Upserting synthetic zpool to Storage Manager: {}", - pool.to_string() - ); - storage_manager.upsert_synthetic_disk(pool.clone()).await; - } - } - let service_manager = ServiceManager::new( log.clone(), global_zone_bootstrap_link_local_address, diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index f049acac007..868d293caa8 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -225,7 +225,9 @@ pub enum DatasetKind { nic: NetworkInterface, }, InternalDns { + /// The address at which the internal DNS server API is reachable. http_address: SocketAddrV6, + /// The address at which the internal DNS server is reachable. dns_address: SocketAddrV6, }, } From 5d857d9e982307f596e8a4a9f7d4f45d4f47d6fa Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Sun, 18 Jun 2023 13:58:37 -0700 Subject: [PATCH 38/39] minor cleanup --- sled-agent/src/rack_setup/plan/service.rs | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 118c9eac993..238d8f9a299 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -286,7 +286,6 @@ impl Plan { let http_port = omicron_common::address::DNS_HTTP_PORT; let http_address = SocketAddrV6::new(internal_ip, http_port, 0, 0); - let dns_port = omicron_common::address::DNS_PORT; let id = Uuid::new_v4(); let zone = dns_builder.host_zone(id, internal_ip).unwrap(); dns_builder @@ -298,17 +297,14 @@ impl Plan { .unwrap(); let (nic, external_ip) = svc_port_builder.next_dns(id, &mut services_ip_pool)?; + let dns_port = omicron_common::address::DNS_PORT; + let dns_address = SocketAddr::new(external_ip, dns_port); request.datasets.push(DatasetEnsureBody { id, zpool_id: u2_zpools[0], dataset_kind: crate::params::DatasetKind::ExternalDns { - http_address: SocketAddrV6::new( - 
internal_ip, - http_port, - 0, - 0, - ), - dns_address: SocketAddr::new(external_ip, dns_port), + http_address, + dns_address, nic, }, address: http_address, @@ -452,6 +448,7 @@ impl Plan { if idx < dns_subnets.len() { let dns_subnet = &dns_subnets[idx]; let dns_ip = dns_subnet.dns_address().ip(); + let dns_address = SocketAddrV6::new(dns_ip, DNS_PORT, 0, 0); let http_address = SocketAddrV6::new(dns_ip, DNS_HTTP_PORT, 0, 0); let id = Uuid::new_v4(); @@ -467,13 +464,8 @@ impl Plan { id, zpool_id: u2_zpools[0], dataset_kind: crate::params::DatasetKind::InternalDns { - http_address: SocketAddrV6::new( - dns_ip, - DNS_HTTP_PORT, - 0, - 0, - ), - dns_address: SocketAddrV6::new(dns_ip, DNS_PORT, 0, 0), + http_address, + dns_address, }, address: http_address, gz_address: Some(dns_subnet.gz_address().ip()), From bb9c30df174d51484b95493ab7de1643f12274d6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 20 Jun 2023 12:53:01 -0700 Subject: [PATCH 39/39] Notify nexus of both external DNS as dataset and service --- sled-agent/src/rack_setup/plan/service.rs | 32 +++++++++++++++++++---- sled-agent/src/services.rs | 11 ++++++-- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 238d8f9a299..f102d25c3a9 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -299,17 +299,39 @@ impl Plan { svc_port_builder.next_dns(id, &mut services_ip_pool)?; let dns_port = omicron_common::address::DNS_PORT; let dns_address = SocketAddr::new(external_ip, dns_port); + let dataset_kind = crate::params::DatasetKind::ExternalDns { + http_address, + dns_address, + nic: nic.clone(), + }; + request.datasets.push(DatasetEnsureBody { id, zpool_id: u2_zpools[0], - dataset_kind: crate::params::DatasetKind::ExternalDns { - http_address, - dns_address, - nic, - }, + dataset_kind: dataset_kind.clone(), address: http_address, gz_address: None, }); + 
request.services.push(ServiceZoneRequest { + id, + zone_type: ZoneType::ExternalDns, + addresses: vec![internal_ip], + dataset: Some(crate::storage::dataset::DatasetName::new( + illumos_utils::zpool::ZpoolName::new_external( + u2_zpools[0], + ), + dataset_kind, + )), + gz_addresses: vec![], + services: vec![ServiceZoneService { + id, + details: ServiceType::ExternalDns { + http_address, + dns_address, + nic, + }, + }], + }); } // The first enumerated sleds get assigned the responsibility diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index e22c6143cff..97cdd930136 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -565,7 +565,11 @@ impl ServiceManager { .map_err(|_| "already set".to_string()) .expect("Sled Agent should only start once"); - self.load_non_storage_services().await?; + self.load_non_storage_services().await.map_err(|e| { + error!(self.inner.log, "failed to launch non-storage services"; "error" => e.to_string()); + e + })?; + // TODO(https://github.com/oxidecomputer/omicron/issues/2973): // These will fail if the disks aren't attached. // Should we have a retry loop here? Kinda like we have with the switch @@ -573,7 +577,10 @@ impl ServiceManager { // // NOTE: We could totally do the same thing with // "load_non_storage_services". - self.load_storage_services().await?; + self.load_storage_services().await.map_err(|e| { + error!(self.inner.log, "failed to launch storage services"; "error" => e.to_string()); + e + })?; Ok(()) }