diff --git a/Cargo.lock b/Cargo.lock
index 58be17f8c5..46e21ab933 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6430,6 +6430,7 @@ dependencies = [
  "iddqd",
  "nexus-sled-agent-shared",
  "nexus-types",
+ "ntp-admin-client",
  "omicron-cockroach-metrics",
  "omicron-common",
  "omicron-sled-agent",
@@ -6442,6 +6443,7 @@ dependencies = [
  "sled-agent-types",
  "sled-agent-zone-images-examples",
  "slog",
+ "slog-error-chain",
  "strum 0.27.1",
  "swrite",
  "thiserror 2.0.12",
@@ -7709,6 +7711,7 @@ dependencies = [
  "nexus-test-utils",
  "nexus-test-utils-macros",
  "nexus-types",
+ "ntp-admin-client",
  "num-integer",
  "omicron-cockroach-metrics",
  "omicron-common",
diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index 674f311700..ffe813f80e 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -64,6 +64,7 @@ nexus-mgs-updates.workspace = true
 nexus-networking.workspace = true
 nexus-saga-recovery.workspace = true
 nexus-test-interface.workspace = true
+ntp-admin-client.workspace = true
 num-integer.workspace = true
 omicron-cockroach-metrics.workspace = true
 openssl.workspace = true
diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs
index 7c1ee230de..eec64ea738 100644
--- a/nexus/db-model/src/inventory.rs
+++ b/nexus/db-model/src/inventory.rs
@@ -34,7 +34,7 @@ use nexus_db_schema::schema::{
     inv_last_reconciliation_disk_result,
     inv_last_reconciliation_orphaned_dataset,
     inv_last_reconciliation_zone_result, inv_mupdate_override_non_boot,
-    inv_nvme_disk_firmware, inv_omicron_sled_config,
+    inv_ntp_timesync, inv_nvme_disk_firmware, inv_omicron_sled_config,
     inv_omicron_sled_config_dataset, inv_omicron_sled_config_disk,
     inv_omicron_sled_config_zone, inv_omicron_sled_config_zone_nic,
     inv_physical_disk, inv_root_of_trust, inv_root_of_trust_page,
@@ -61,7 +61,7 @@ use nexus_sled_agent_shared::inventory::{
 };
 use nexus_types::inventory::{
     BaseboardId, Caboose, CockroachStatus, Collection, NvmeFirmware,
-    PowerState, RotPage, RotSlot,
+    PowerState, RotPage, RotSlot, TimeSync,
 };
 use omicron_common::api::external;
 use omicron_common::api::internal::shared::NetworkInterface;
@@ -2788,6 +2788,38 @@ impl TryFrom<InvCockroachStatus> for CockroachStatus {
     }
 }
 
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_ntp_timesync)]
+pub struct InvNtpTimesync {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub ip: ipv6::Ipv6Addr,
+    pub port: SqlU16,
+    pub synced: bool,
+}
+
+impl InvNtpTimesync {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        timesync: &TimeSync,
+    ) -> Result<Self, anyhow::Error> {
+        Ok(Self {
+            inv_collection_id: inv_collection_id.into(),
+            ip: timesync.addr.ip().into(),
+            port: timesync.addr.port().into(),
+            synced: timesync.synced,
+        })
+    }
+}
+
+impl From<InvNtpTimesync> for nexus_types::inventory::TimeSync {
+    fn from(value: InvNtpTimesync) -> Self {
+        Self {
+            addr: SocketAddrV6::new(*value.ip, *value.port, 0, 0),
+            synced: value.synced,
+        }
+    }
+}
+
 #[cfg(test)]
 mod test {
     use nexus_types::inventory::NvmeFirmware;
diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs
index 307ada9b48..93b9dd84f9 100644
--- a/nexus/db-queries/src/db/datastore/inventory.rs
+++ b/nexus/db-queries/src/db/datastore/inventory.rs
@@ -42,6 +42,7 @@ use nexus_db_model::InvLastReconciliationDatasetResult;
 use nexus_db_model::InvLastReconciliationDiskResult;
 use nexus_db_model::InvLastReconciliationOrphanedDataset;
 use nexus_db_model::InvLastReconciliationZoneResult;
+use nexus_db_model::InvNtpTimesync;
 use nexus_db_model::InvNvmeDiskFirmware;
 use nexus_db_model::InvOmicronSledConfig;
 use nexus_db_model::InvOmicronSledConfigDataset;
@@ -93,6 +94,7 @@ use nexus_types::inventory::CockroachStatus;
 use nexus_types::inventory::Collection;
 use nexus_types::inventory::PhysicalDiskFirmware;
 use nexus_types::inventory::SledAgent;
+use nexus_types::inventory::TimeSync;
 use omicron_cockroach_metrics::NodeId as CockroachNodeId;
 use omicron_common::api::external::Error;
 use omicron_common::api::external::InternalContext;
@@ -385,6 +387,13 @@ impl DataStore {
             .collect::<Result<Vec<_>, _>>()
             .map_err(|e| Error::internal_error(&e.to_string()))?;
 
+        let inv_ntp_timesync_records: Vec<InvNtpTimesync> = collection
+            .ntp_timesync
+            .iter()
+            .map(|timesync| InvNtpTimesync::new(collection_id, timesync))
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(|e| Error::internal_error(&e.to_string()))?;
+
         // This implementation inserts all records associated with the
         // collection in one transaction. This is primarily for simplicity. It
         // means we don't have to worry about other readers seeing a
@@ -1424,6 +1433,14 @@ impl DataStore {
                     .await?;
             }
 
+            // Insert the NTP info we've observed
+            if !inv_ntp_timesync_records.is_empty() {
+                use nexus_db_schema::schema::inv_ntp_timesync::dsl;
+                diesel::insert_into(dsl::inv_ntp_timesync)
+                    .values(inv_ntp_timesync_records)
+                    .execute_async(&conn)
+                    .await?;
+            }
 
             // Finally, insert the list of errors.
             {
@@ -1714,6 +1731,7 @@
             nerrors: usize,
             nclickhouse_keeper_membership: usize,
             ncockroach_status: usize,
+            nntp_timesync: usize,
         }
 
         let NumRowsDeleted {
@@ -1744,6 +1762,7 @@
             nerrors,
             nclickhouse_keeper_membership,
             ncockroach_status,
+            nntp_timesync,
         } = self
             .transaction_retry_wrapper("inventory_delete_collection")
             .transaction(&conn, |conn| async move {
@@ -2000,6 +2019,17 @@ impl DataStore {
                     .execute_async(&conn)
                     .await?
                 };
+                // Remove rows for NTP timesync
+                let nntp_timesync = {
+                    use nexus_db_schema::schema::inv_ntp_timesync::dsl;
+                    diesel::delete(
+                        dsl::inv_ntp_timesync.filter(
+                            dsl::inv_collection_id.eq(db_collection_id),
+                        ),
+                    )
+                    .execute_async(&conn)
+                    .await?
+                };
 
                 Ok(NumRowsDeleted {
                     ncollections,
@@ -2029,6 +2059,7 @@ impl DataStore {
                     nerrors,
                     nclickhouse_keeper_membership,
                     ncockroach_status,
+                    nntp_timesync,
                 })
             })
             .await
@@ -2069,6 +2100,7 @@ impl DataStore {
             "nerrors" => nerrors,
             "nclickhouse_keeper_membership" => nclickhouse_keeper_membership,
             "ncockroach_status" => ncockroach_status,
+            "nntp_timesync" => nntp_timesync,
         );
 
         Ok(())
@@ -3435,6 +3467,25 @@ impl DataStore {
                 .collect::<Result<BTreeMap<_, _>, Error>>()?
         };
 
+        // Load TimeSync statuses
+        let ntp_timesync: IdOrdMap<TimeSync> = {
+            use nexus_db_schema::schema::inv_ntp_timesync::dsl;
+
+            let records: Vec<InvNtpTimesync> = dsl::inv_ntp_timesync
+                .filter(dsl::inv_collection_id.eq(db_id))
+                .select(InvNtpTimesync::as_select())
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+
+            records
+                .into_iter()
+                .map(|record| TimeSync::from(record))
+                .collect::<IdOrdMap<_>>()
+        };
+
         // Finally, build up the sled-agent map using the sled agent and
         // omicron zone rows. A for loop is easier to understand than into_iter
         // + filter_map + return Result + collect.
@@ -3681,6 +3732,7 @@ impl DataStore {
             sled_agents,
             clickhouse_keeper_cluster_membership,
             cockroach_status,
+            ntp_timesync,
         })
     }
 }
diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs
index 16c116c8e4..a2f35b34ed 100644
--- a/nexus/db-schema/src/schema.rs
+++ b/nexus/db-schema/src/schema.rs
@@ -1899,6 +1899,15 @@ table! {
     }
 }
 
+table! {
+    inv_ntp_timesync (inv_collection_id, ip, port) {
+        inv_collection_id -> Uuid,
+        ip -> Inet,
+        port -> Int4,
+        synced -> Bool,
+    }
+}
+
 /* blueprints */
 
 table! {
diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml
index 31bf387a08..3d2a220ca5 100644
--- a/nexus/inventory/Cargo.toml
+++ b/nexus/inventory/Cargo.toml
@@ -23,6 +23,7 @@ id-map.workspace = true
 iddqd.workspace = true
 nexus-sled-agent-shared.workspace = true
 nexus-types.workspace = true
+ntp-admin-client.workspace = true
 omicron-common.workspace = true
 omicron-uuid-kinds.workspace = true
 reqwest.workspace = true
@@ -31,6 +32,7 @@ sled-agent-client.workspace = true
 sled-agent-types.workspace = true
 sled-agent-zone-images-examples.workspace = true
 slog.workspace = true
+slog-error-chain.workspace = true
 strum.workspace = true
 swrite.workspace = true
 thiserror.workspace = true
diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs
index d9fe84ca7a..1534662d69 100644
--- a/nexus/inventory/src/builder.rs
+++ b/nexus/inventory/src/builder.rs
@@ -31,6 +31,7 @@ use nexus_types::inventory::RotPageWhich;
 use nexus_types::inventory::RotState;
 use nexus_types::inventory::ServiceProcessor;
 use nexus_types::inventory::SledAgent;
+use nexus_types::inventory::TimeSync;
 use nexus_types::inventory::Zpool;
 use omicron_cockroach_metrics::CockroachMetric;
 use omicron_cockroach_metrics::NodeId;
@@ -120,6 +121,7 @@ pub struct CollectionBuilder {
     clickhouse_keeper_cluster_membership:
         BTreeSet<ClickhouseKeeperClusterMembership>,
     cockroach_status: BTreeMap<NodeId, CockroachStatus>,
+    ntp_timesync: IdOrdMap<TimeSync>,
     // CollectionBuilderRng is taken by value, rather than passed in as a
     // mutable ref, to encourage a tree-like structure where each RNG is
     // generally independent.
@@ -150,6 +152,7 @@ impl CollectionBuilder {
             sleds: IdOrdMap::new(),
             clickhouse_keeper_cluster_membership: BTreeSet::new(),
             cockroach_status: BTreeMap::new(),
+            ntp_timesync: IdOrdMap::new(),
             rng: CollectionBuilderRng::from_entropy(),
         }
     }
@@ -173,6 +176,7 @@ impl CollectionBuilder {
             clickhouse_keeper_cluster_membership: self
                 .clickhouse_keeper_cluster_membership,
             cockroach_status: self.cockroach_status,
+            ntp_timesync: self.ntp_timesync,
         }
     }
 
@@ -550,6 +554,19 @@ impl CollectionBuilder {
         self.clickhouse_keeper_cluster_membership.insert(membership);
     }
 
+    /// Record information about timesync
+    pub fn found_ntp_timesync(
+        &mut self,
+        timesync: TimeSync,
+    ) -> Result<(), anyhow::Error> {
+        self.ntp_timesync
+            .insert_unique(timesync)
+            .map_err(|err| err.into_owned())
+            .with_context(|| {
+                format!("NTP service reported timesync data multiple times")
+            })
+    }
+
     /// Record metrics from a CockroachDB node
     pub fn found_cockroach_metrics(
         &mut self,
@@ -621,6 +638,8 @@ mod test {
         assert!(collection.cabooses_found.is_empty());
         assert!(collection.rot_pages_found.is_empty());
         assert!(collection.clickhouse_keeper_cluster_membership.is_empty());
+        assert!(collection.cockroach_status.is_empty());
+        assert!(collection.ntp_timesync.is_empty());
     }
 
     // Simple test of a single, fairly typical collection that contains just
diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs
index eac99c6d04..97c5406134 100644
--- a/nexus/inventory/src/collector.rs
+++ b/nexus/inventory/src/collector.rs
@@ -19,6 +19,7 @@ use omicron_cockroach_metrics::CockroachClusterAdminClient;
 use slog::Logger;
 use slog::o;
 use slog::{debug, error};
+use std::net::SocketAddrV6;
 use std::time::Duration;
 use strum::IntoEnumIterator;
 
@@ -31,6 +32,14 @@ pub struct Collector<'a> {
     mgs_clients: Vec<gateway_client::Client>,
     keeper_admin_clients: Vec<clickhouse_admin_keeper_client::Client>,
     cockroach_admin_client: &'a CockroachClusterAdminClient,
+    // We do care which sled (or, equivalently, which zone UUID) this NTP
+    // information is paired with; with full inventory context we can map
+    // between the two.
+    //
+    // However, internal DNS records do not distinguish between NTP zones, so
+    // we instead record the address of each client, which can later be used
+    // to identify the sled that the queried timesync data came from.
+    ntp_admin_clients: Vec<(SocketAddrV6, ntp_admin_client::Client)>,
     sled_agent_lister: &'a (dyn SledAgentEnumerator + Send + Sync),
     in_progress: CollectionBuilder,
 }
@@ -41,6 +50,7 @@ impl<'a> Collector<'a> {
     pub fn new(
         creator: &str,
         mgs_clients: Vec<gateway_client::Client>,
         keeper_admin_clients: Vec<clickhouse_admin_keeper_client::Client>,
         cockroach_admin_client: &'a CockroachClusterAdminClient,
+        ntp_admin_clients: Vec<(SocketAddrV6, ntp_admin_client::Client)>,
         sled_agent_lister: &'a (dyn SledAgentEnumerator + Send + Sync),
         log: slog::Logger,
     ) -> Self {
@@ -49,6 +59,7 @@ impl<'a> Collector<'a> {
             mgs_clients,
             keeper_admin_clients,
             cockroach_admin_client,
+            ntp_admin_clients,
             sled_agent_lister,
             in_progress: CollectionBuilder::new(creator),
         }
     }
@@ -73,6 +84,7 @@ impl<'a> Collector<'a> {
 
         self.collect_all_mgs().await;
         self.collect_all_sled_agents().await;
+        self.collect_all_timesync().await;
         self.collect_all_keepers().await;
         self.collect_all_cockroach().await;
 
@@ -367,6 +379,55 @@ impl<'a> Collector<'a> {
         self.in_progress.found_sled_inventory(&sled_agent_url, inventory)
     }
 
+    /// Collect timesync status from all sleds
+    async fn collect_all_timesync(&mut self) {
+        for (addr, client) in &self.ntp_admin_clients {
+            if let Err(err) = Self::collect_one_timesync(
+                &self.log,
+                *addr,
+                client,
+                &mut self.in_progress,
+            )
+            .await
+            {
+                error!(
+                    &self.log,
+                    "timesync collection error";
+                    "addr" => ?addr,
+                    slog_error_chain::InlineErrorChain::new(err.as_ref())
+                );
+            }
+        }
+    }
+
+    async fn collect_one_timesync(
+        log: &slog::Logger,
+        addr: SocketAddrV6,
+        client: &ntp_admin_client::Client,
+        in_progress: &mut CollectionBuilder,
+    ) -> Result<(), anyhow::Error> {
+        let sled_agent_url = client.baseurl();
+        debug!(&log, "begin collection from Sled Agent (timesync)";
+            "sled_agent_url" => client.baseurl()
+        );
+
+        let maybe_ident = client.timesync().await.with_context(|| {
+            format!("Sled Agent {:?}: timesync", &sled_agent_url)
+        });
+        let timesync = match maybe_ident {
+            Ok(timesync) => nexus_types::inventory::TimeSync {
+                addr,
+                synced: timesync.into_inner().sync,
+            },
+            Err(error) => {
+                in_progress.found_error(InventoryError::from(error));
+                return Ok(());
+            }
+        };
+
+        in_progress.found_ntp_timesync(timesync)
+    }
+
     /// Collect inventory from about keepers from all `ClickhouseAdminKeeper`
     /// clients
     async fn collect_all_keepers(&mut self) {
@@ -833,6 +894,7 @@ mod test {
         // We don't have any mocks for this, and it's unclear how much value
        // there would be in providing them at this juncture.
         let keeper_clients = Vec::new();
+        let ntp_clients = Vec::new();
         // Configure the mock server as a backend for the CockroachDB client
         let timeout = Duration::from_secs(15);
         let crdb_cluster =
@@ -844,6 +906,7 @@ mod test {
             vec![mgs_client],
             keeper_clients,
             &crdb_cluster,
+            ntp_clients,
             &sled_enum,
             log.clone(),
         );
@@ -916,6 +979,7 @@ mod test {
         // We don't have any mocks for this, and it's unclear how much value
         // there would be in providing them at this juncture.
         let keeper_clients = Vec::new();
+        let ntp_clients = Vec::new();
         let timeout = Duration::from_secs(15);
         let crdb_cluster =
             CockroachClusterAdminClient::new(log.clone(), timeout);
@@ -926,6 +990,7 @@ mod test {
             mgs_clients,
             keeper_clients,
             &crdb_cluster,
+            ntp_clients,
             &sled_enum,
             log.clone(),
         );
@@ -969,6 +1034,7 @@ mod test {
         // We don't have any mocks for this, and it's unclear how much value
         // there would be in providing them at this juncture.
         let keeper_clients = Vec::new();
+        let ntp_clients = Vec::new();
         let timeout = Duration::from_secs(15);
         let crdb_cluster =
             CockroachClusterAdminClient::new(log.clone(), timeout);
@@ -979,6 +1045,7 @@ mod test {
             mgs_clients,
             keeper_clients,
             &crdb_cluster,
+            ntp_clients,
             &sled_enum,
             log.clone(),
         );
@@ -1027,6 +1094,7 @@ mod test {
         // We don't have any mocks for this, and it's unclear how much value
         // there would be in providing them at this juncture.
         let keeper_clients = Vec::new();
+        let ntp_clients = Vec::new();
         let timeout = Duration::from_secs(15);
         let crdb_cluster =
             CockroachClusterAdminClient::new(log.clone(), timeout);
@@ -1037,6 +1105,7 @@ mod test {
             vec![mgs_client],
             keeper_clients,
             &crdb_cluster,
+            ntp_clients,
             &sled_enum,
             log.clone(),
         );
diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs
index a1c71f7fc4..c02d7173d9 100644
--- a/nexus/inventory/src/examples.rs
+++ b/nexus/inventory/src/examples.rs
@@ -623,6 +623,18 @@ pub fn representative() -> Representative {
         },
     );
 
+    builder
+        .found_ntp_timesync(nexus_types::inventory::TimeSync {
+            addr: std::net::SocketAddrV6::new(
+                std::net::Ipv6Addr::LOCALHOST,
+                0,
+                0,
+                0,
+            ),
+            synced: true,
+        })
+        .unwrap();
+
     Representative {
         builder,
         sleds: [sled1_bb, sled2_bb, sled3_bb, sled4_bb],
diff --git a/nexus/src/app/background/tasks/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs
index 9d1b1f5979..906b6a8aa6 100644
--- a/nexus/src/app/background/tasks/inventory_collection.rs
+++ b/nexus/src/app/background/tasks/inventory_collection.rs
@@ -196,6 +196,30 @@ async fn inventory_activate(
         },
     };
 
+    // Find ntp-admin servers if there are any.
+    let boundary_ntp_admin_addrs = resolver
+        .lookup_all_socket_v6(ServiceName::BoundaryNtp)
+        .await
+        .context("looking up boundary NTP addresses")?;
+    let internal_ntp_admin_addrs = resolver
+        .lookup_all_socket_v6(ServiceName::InternalNtp)
+        .await
+        .context("looking up internal NTP addresses")?;
+
+    let ntp_admin_clients = boundary_ntp_admin_addrs
+        .into_iter()
+        .chain(internal_ntp_admin_addrs.into_iter())
+        .map(|mut addr| {
+            // TODO(https://github.com/oxidecomputer/omicron/issues/8602):
+            // If we could look up all NTP Admin services, we could skip
+            // this port hard-coding.
+            addr.set_port(omicron_common::address::NTP_ADMIN_PORT);
+            let url = format!("http://{}", addr);
+            let log = opctx.log.new(o!("ntp_admin_url" => url.clone()));
+            (addr, ntp_admin_client::Client::new(&url, log))
+        })
+        .collect();
+
     // Update CockroachDB cluster backends.
     let cockroach_addresses = resolver
         .lookup_all_socket_v6(ServiceName::Cockroach)
@@ -224,6 +248,7 @@ async fn inventory_activate(
         mgs_clients,
         keeper_admin_clients,
         cockroach_admin_client,
+        ntp_admin_clients,
         &sled_enum,
         opctx.log.clone(),
     );
diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs
index c71136a00f..f5b92017ef 100644
--- a/nexus/types/src/inventory.rs
+++ b/nexus/types/src/inventory.rs
@@ -153,6 +153,9 @@ pub struct Collection {
 
     /// The status of our cockroachdb cluster, keyed by node identifier
     pub cockroach_status: BTreeMap<NodeId, CockroachStatus>,
+
+    /// The status of time synchronization
+    pub ntp_timesync: IdOrdMap<TimeSync>,
 }
 
 impl Collection {
@@ -610,3 +613,21 @@ pub struct CockroachStatus {
     pub ranges_underreplicated: Option<u64>,
     pub liveness_live_nodes: Option<u64>,
 }
+
+/// Inventory representation of whether an NTP service reports time to be synced
+#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
+pub struct TimeSync {
+    /// Address of the NTP admin server contacted
+    pub addr: SocketAddrV6,
+
+    /// Whether or not the service claims time is synchronized
+    pub synced: bool,
+}
+
+impl IdOrdItem for TimeSync {
+    type Key<'a> = SocketAddrV6;
+    fn key(&self) -> Self::Key<'_> {
+        self.addr
+    }
+    id_upcast!();
+}
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 9b90c2b8e8..cd3f8506b4 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -4258,6 +4258,15 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_cockroachdb_status (
     PRIMARY KEY (inv_collection_id, node_id)
 );
 
+CREATE TABLE IF NOT EXISTS omicron.public.inv_ntp_timesync (
+    inv_collection_id UUID NOT NULL,
+    ip INET NOT NULL,
+    port INT4 NOT NULL CHECK (port BETWEEN 0 AND 65535),
+    synced BOOL NOT NULL,
+
+    PRIMARY KEY (inv_collection_id, ip, port)
+);
+
 /*
  * Various runtime configuration switches for reconfigurator
 *