diff --git a/Cargo.lock b/Cargo.lock index 9db98afdd4..58be17f8c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8144,6 +8144,7 @@ dependencies = [ "nexus-reconfigurator-blippy", "nexus-sled-agent-shared", "nexus-types", + "ntp-admin-client", "omicron-common", "omicron-ddm-admin-client", "omicron-test-utils", @@ -12116,6 +12117,7 @@ dependencies = [ "installinator-common", "key-manager", "nexus-sled-agent-shared", + "ntp-admin-client", "omicron-common", "omicron-test-utils", "omicron-uuid-kinds", diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index 999aedff62..2919ac04aa 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -71,6 +71,7 @@ Nexus Internal API (client: nexus-client) consumed by: propolis-server (propolis/bin/propolis-server) via 3 paths NTP Admin (client: ntp-admin-client) + consumed by: omicron-sled-agent (omicron/sled-agent) via 2 paths External API (client: oxide-client) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 43f4089326..62b2817838 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -1464,29 +1464,6 @@ } } }, - "/timesync": { - "get": { - "operationId": "timesync_get", - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/TimeSync" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/v2p": { "get": { "summary": "List v2p mappings present on sled", @@ -7326,50 +7303,6 @@ "uplinks" ] }, - "TimeSync": { - "type": "object", - "properties": { - "correction": { - "description": "The current offset between the NTP clock and system clock.", - "type": "number", - "format": "double" - }, - "ip_addr": { - "description": "The NTP reference IP address.", - "type": "string", - "format": "ip" - }, - "ref_id": { - "description": "The NTP reference ID.", - "type": "integer", - "format": "uint32", - "minimum": 0 - }, - "ref_time": { - "description": "The NTP reference time (i.e. what chrony thinks the current time is, not necessarily the current system time).", - "type": "number", - "format": "double" - }, - "stratum": { - "description": "The NTP stratum (our upstream's stratum plus one).", - "type": "integer", - "format": "uint8", - "minimum": 0 - }, - "sync": { - "description": "The synchronization state of the sled, true when the system clock and the NTP clock are in sync (to within a small window).", - "type": "boolean" - } - }, - "required": [ - "correction", - "ip_addr", - "ref_id", - "ref_time", - "stratum", - "sync" - ] - }, "TxEqConfig": { "description": "Per-port tx-eq overrides. This can be used to fine-tune the transceiver equalization settings to improve signal integrity.", "type": "object", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 94d662da43..7bbcc7495a 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -59,6 +59,7 @@ nexus-client.workspace = true nexus-config.workspace = true nexus-sled-agent-shared.workspace = true nexus-types.workspace = true +ntp-admin-client.workspace = true omicron-common.workspace = true omicron-ddm-admin-client.workspace = true omicron-uuid-kinds.workspace = true diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 1dffb0da10..650cc04b01 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -42,7 +42,6 @@ use sled_agent_types::{ VmmPutStateResponse, VmmUnregisterResponse, }, sled::AddSledRequest, - time_sync::TimeSync, zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, PriorityOrder, ZoneBundleId, ZoneBundleMetadata, @@ -477,14 +476,6 @@ pub trait SledAgentApi { rqctx: RequestContext, ) -> Result>, HttpError>; - #[endpoint { - method = GET, - path = "/timesync", - }] - async fn timesync_get( - rqctx: RequestContext, - ) -> Result, HttpError>; - #[endpoint { method = POST, path = "/switch-ports", diff --git a/sled-agent/config-reconciler/Cargo.toml b/sled-agent/config-reconciler/Cargo.toml index 1d061ee66e..e0723007e0 100644 --- a/sled-agent/config-reconciler/Cargo.toml +++ b/sled-agent/config-reconciler/Cargo.toml @@ -26,6 +26,7 @@ illumos-utils.workspace = true installinator-common.workspace = true key-manager.workspace = true nexus-sled-agent-shared.workspace = true +ntp-admin-client.workspace = true omicron-common.workspace = true omicron-uuid-kinds.workspace = true serde.workspace = true diff --git a/sled-agent/config-reconciler/src/reconciler_task.rs b/sled-agent/config-reconciler/src/reconciler_task.rs index 33bc8f7dc1..10e7acaeb7 100644 --- a/sled-agent/config-reconciler/src/reconciler_task.rs +++ b/sled-agent/config-reconciler/src/reconciler_task.rs @@ -503,7 +503,7 @@ impl ReconcilerTask { // Collect the current timesync status (needed to start any new zones, // and also we want to report it as part of each reconciler result). - let timesync_status = self.zones.check_timesync().await; + let timesync_status = self.zones.check_timesync(&self.log).await; // Call back into sled-agent and let it do any work that needs to happen // once time is sync'd (e.g., rewrite `uptime`). diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index 76e0f6f11b..a51802b22d 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -24,9 +24,9 @@ use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; use nexus_sled_agent_shared::inventory::OmicronZoneType; +use ntp_admin_client::types::TimeSync; use omicron_common::address::Ipv6Subnet; use omicron_uuid_kinds::OmicronZoneUuid; -use sled_agent_types::time_sync::TimeSync; use sled_agent_types::zone_bundle::ZoneBundleCause; use sled_agent_types::zone_images::ResolverStatus; use sled_storage::config::MountConfig; @@ -36,10 +36,7 @@ use slog::warn; use slog_error_chain::InlineErrorChain; use std::borrow::Cow; use std::collections::BTreeMap; -use std::net::IpAddr; -use std::net::Ipv6Addr; use std::num::NonZeroUsize; -use std::str::FromStr as _; use std::sync::Arc; use super::OmicronDatasets; @@ -70,6 +67,8 @@ pub enum TimeSyncError { NoRunningNtpZone, #[error("multiple running NTP zones - this should never happen!")] MultipleRunningNtpZones, + #[error("failed to communicate with NTP admin server")] + NtpAdmin(#[from] ntp_admin_client::Error), #[error("failed to execute chronyc within NTP zone")] ExecuteChronyc(#[source] RunCommandError), #[error( @@ -465,10 +464,10 @@ impl OmicronZones { } /// Check the timesync status from a running NTP zone (if it exists) - pub(super) async fn check_timesync(&self) -> TimeSyncStatus { + pub(super) async fn check_timesync(&self, log: &Logger) -> TimeSyncStatus { match &self.timesync_config { TimeSyncConfig::Normal => { - match self.timesync_status_from_ntp_zone().await { + match self.timesync_status_from_ntp_zone(log).await { Ok(timesync) => TimeSyncStatus::TimeSync(timesync), Err(err) => { TimeSyncStatus::FailedToGetSyncStatus(Arc::new(err)) @@ -481,83 +480,45 @@ impl OmicronZones { async fn timesync_status_from_ntp_zone( &self, + log: &Logger, ) -> Result { // Get the one and only running NTP zone, or return an error. - let mut running_ntp_zones = self.zones.iter().filter_map(|z| { + let mut ntp_admin_addresses = self.zones.iter().filter_map(|z| { if !z.config.zone_type.is_ntp() { return None; } + // TODO(https://github.com/oxidecomputer/omicron/issues/6796): + // + // We could avoid hard-coding the port here if the zone was fully + // specified to include both NTP and Admin server addresses. + let mut addr = match z.config.zone_type { + OmicronZoneType::BoundaryNtp { address, .. } => address, + OmicronZoneType::InternalNtp { address, .. } => address, + _ => return None, + }; + addr.set_port(omicron_common::address::NTP_ADMIN_PORT); match &z.state { - ZoneState::Running(running_zone) => Some(running_zone), + ZoneState::Running(_) => Some(addr), ZoneState::PartiallyShutDown { .. } | ZoneState::FailedToStart(_) => None, } }); - let running_ntp_zone = - running_ntp_zones.next().ok_or(TimeSyncError::NoRunningNtpZone)?; - if running_ntp_zones.next().is_some() { + let ntp_admin_address = ntp_admin_addresses + .next() + .ok_or(TimeSyncError::NoRunningNtpZone)?; + if ntp_admin_addresses.next().is_some() { return Err(TimeSyncError::MultipleRunningNtpZones); } - // XXXNTP - This could be replaced with a direct connection to the - // daemon using a patched version of the chrony_candm crate to allow - // a custom server socket path. From the GZ, it should be possible to - // connect to the UNIX socket at - // format!("{}/var/run/chrony/chronyd.sock", ntp_zone.root()) - - let stdout = running_ntp_zone - .run_cmd(&["/usr/bin/chronyc", "-c", "tracking"]) - .map_err(TimeSyncError::ExecuteChronyc)?; - - let v: Vec<&str> = stdout.split(',').collect(); - - if v.len() < 10 { - return Err(TimeSyncError::FailedToParse { - reason: "too few fields", - stdout, - }); - } - - let Ok(ref_id) = u32::from_str_radix(v[0], 16) else { - return Err(TimeSyncError::FailedToParse { - reason: "bad ref_id", - stdout, - }); - }; - let ip_addr = - IpAddr::from_str(v[1]).unwrap_or(Ipv6Addr::UNSPECIFIED.into()); - let Ok(stratum) = u8::from_str(v[2]) else { - return Err(TimeSyncError::FailedToParse { - reason: "bad stratum", - stdout, - }); - }; - let Ok(ref_time) = f64::from_str(v[3]) else { - return Err(TimeSyncError::FailedToParse { - reason: "bad ref_time", - stdout, - }); - }; - let Ok(correction) = f64::from_str(v[4]) else { - return Err(TimeSyncError::FailedToParse { - reason: "bad correction", - stdout, - }); - }; - - // Per `chronyc waitsync`'s implementation, if either the - // reference IP address is not unspecified or the reference - // ID is not 0 or 0x7f7f0101, we are synchronized to a peer. - let peer_sync = - !ip_addr.is_unspecified() || (ref_id != 0 && ref_id != 0x7f7f0101); + let client = ntp_admin_client::Client::new( + &format!("http://{ntp_admin_address}"), + log.clone(), + ); - let sync = stratum < 10 - && ref_time > 1234567890.0 - && peer_sync - && correction.abs() <= 0.05; + let timesync = client.timesync().await?.into_inner(); - Ok(TimeSync { sync, ref_id, ip_addr, stratum, ref_time, correction }) + Ok(timesync) } } @@ -1111,6 +1072,7 @@ mod tests { use sled_agent_types::zone_images::ZoneManifestStatus; use std::collections::BTreeSet; use std::collections::VecDeque; + use std::net::Ipv6Addr; use std::sync::Mutex; use tufaceous_artifact::ArtifactHash; diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index e196ebbe57..cd51cb39e6 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -36,7 +36,6 @@ use sled_agent_types::instance::{ VmmPutStateResponse, VmmUnregisterResponse, }; use sled_agent_types::sled::AddSledRequest; -use sled_agent_types::time_sync::TimeSync; use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, StorageLimit, ZoneBundleId, ZoneBundleMetadata, @@ -708,13 +707,6 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseOk(vnics)) } - async fn timesync_get( - rqctx: RequestContext, - ) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.timesync_get().await.map_err(|e| Error::from(e))?)) - } - async fn uplink_ensure( rqctx: RequestContext, body: TypedBody, diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index f985a1b53f..2327e01df1 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -101,7 +101,12 @@ use nexus_types::deployment::{ BlueprintSledConfig, OximeterReadMode, PendingMgsUpdates, }; use nexus_types::external_api::views::SledState; -use omicron_common::address::{COCKROACH_ADMIN_PORT, get_sled_address}; +use ntp_admin_client::{ + Client as NtpAdminClient, Error as NtpAdminError, types::TimeSync, +}; +use omicron_common::address::{ + COCKROACH_ADMIN_PORT, NTP_ADMIN_PORT, get_sled_address, +}; use omicron_common::api::external::Generation; use omicron_common::api::internal::shared::ExternalPortDiscovery; use omicron_common::api::internal::shared::LldpAdminStatus; @@ -128,7 +133,6 @@ use sled_agent_types::rack_init::{ }; use sled_agent_types::rack_ops::RssStep; use sled_agent_types::sled::StartSledAgentRequest; -use sled_agent_types::time_sync::TimeSync; use sled_hardware_types::underlay::BootstrapInterface; use slog::Logger; use slog_error_chain::InlineErrorChain; @@ -208,6 +212,9 @@ pub enum SetupServiceError { #[error("Error making HTTP request to Nexus: {0}")] NexusApi(#[from] NexusError), + #[error("Error making HTTP request to NTP Admin Server")] + NtpAdminApi(#[from] NtpAdminError), + #[error("Error contacting ddmd: {0}")] DdmError(#[from] DdmError), @@ -692,7 +699,7 @@ impl ServiceInner { async fn sled_timesync( &self, - sled_address: &SocketAddrV6, + address: &SocketAddrV6, ) -> Result { let dur = std::time::Duration::from_secs(60); @@ -701,18 +708,15 @@ impl ServiceInner { .timeout(dur) .build() .map_err(SetupServiceError::HttpClient)?; - let client = SledAgentClient::new_with_client( - &format!("http://{}", sled_address), + let client = NtpAdminClient::new_with_client( + &format!("http://{}", address), client, - self.log.new(o!("SledAgentClient" => sled_address.to_string())), + self.log.new(o!("NtpAdminClient" => address.to_string())), ); - info!( - self.log, - "Checking time synchronization for {}...", sled_address - ); + info!(self.log, "Checking time synchronization for {}...", address); - let ts = client.timesync_get().await?.into_inner(); + let ts = client.timesync().await?.into_inner(); Ok(TimeSync { sync: ts.sync, ref_id: ts.ref_id, @@ -725,7 +729,7 @@ impl ServiceInner { async fn wait_for_timesync( &self, - sled_addresses: &Vec, + ntp_admin_addresses: &Vec, ) -> Result<(), SetupServiceError> { info!(self.log, "Waiting for rack time synchronization"); @@ -733,9 +737,12 @@ impl ServiceInner { let mut synced_peers = 0; let mut sync = true; - for sled_address in sled_addresses { - if let Ok(ts) = self.sled_timesync(sled_address).await { - info!(self.log, "Timesync for {} {:?}", sled_address, ts); + for ntp_admin_address in ntp_admin_addresses { + if let Ok(ts) = self.sled_timesync(ntp_admin_address).await { + info!( + self.log, + "Timesync for {} {:?}", ntp_admin_address, ts + ); if !ts.sync { sync = false; } else { @@ -752,7 +759,7 @@ impl ServiceInner { Err(BackoffError::transient(format!( "Time is synchronized on {}/{} sleds", synced_peers, - sled_addresses.len() + ntp_admin_addresses.len() ))) } }; @@ -1364,14 +1371,34 @@ impl ServiceInner { // Wait until time is synchronized on all sleds before proceeding. rss_step.update(RssStep::WaitForTimeSync); - let sled_addresses: Vec<_> = sled_plan - .sleds - .values() - .map(|initialization_request| { - get_sled_address(initialization_request.body.subnet) + let ntp_addresses: Vec<_> = service_plan + .services + .iter() + .flat_map(|(_, sled_config)| { + sled_config.zones.iter().filter_map(|zone_config| { + match &zone_config.zone_type { + BlueprintZoneType::BoundaryNtp( + blueprint_zone_type::BoundaryNtp { + address, .. + }, + ) => { + let mut ntp_admin_addr = *address; + ntp_admin_addr.set_port(NTP_ADMIN_PORT); + Some(ntp_admin_addr) + } + BlueprintZoneType::InternalNtp( + blueprint_zone_type::InternalNtp { address }, + ) => { + let mut ntp_admin_addr = *address; + ntp_admin_addr.set_port(NTP_ADMIN_PORT); + Some(ntp_admin_addr) + } + _ => None, + } + }) }) .collect(); - self.wait_for_timesync(&sled_addresses).await?; + self.wait_for_timesync(&ntp_addresses).await?; info!(self.log, "Finished setting up Internal DNS and NTP"); diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 0f198d18ac..335efcd297 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -47,7 +47,6 @@ use sled_agent_types::instance::VmmPutStateBody; use sled_agent_types::instance::VmmPutStateResponse; use sled_agent_types::instance::VmmUnregisterResponse; use sled_agent_types::sled::AddSledRequest; -use sled_agent_types::time_sync::TimeSync; use sled_agent_types::zone_bundle::BundleUtilization; use sled_agent_types::zone_bundle::CleanupContext; use sled_agent_types::zone_bundle::CleanupCount; @@ -686,12 +685,6 @@ impl SledAgentApi for SledAgentSimImpl { method_unimplemented() } - async fn timesync_get( - _rqctx: RequestContext, - ) -> Result, HttpError> { - method_unimplemented() - } - async fn sled_identifiers( _rqctx: RequestContext, ) -> Result, HttpError> { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index c30bf21e98..f38012e264 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -53,7 +53,7 @@ use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid}; use sled_agent_config_reconciler::{ ConfigReconcilerHandle, ConfigReconcilerSpawnToken, InternalDisksReceiver, LedgerNewConfigError, LedgerTaskError, ReconcilerInventory, - SledAgentArtifactStore, SledAgentFacilities, TimeSyncStatus, + SledAgentArtifactStore, SledAgentFacilities, }; use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfig; @@ -62,7 +62,6 @@ use sled_agent_types::instance::{ VmmStateRequested, VmmUnregisterResponse, }; use sled_agent_types::sled::{BaseboardId, StartSledAgentRequest}; -use sled_agent_types::time_sync::TimeSync; use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, PriorityOrder, StorageLimit, ZoneBundleCause, ZoneBundleMetadata, @@ -999,22 +998,6 @@ impl SledAgent { .map_err(Error::from) } - /// Gets the sled's current time synchronization state - pub async fn timesync_get(&self) -> Result { - let status = self.inner.config_reconciler.timesync_status(); - - // TODO-cleanup we could give a more specific error cause in the - // `FailedToGetSyncStatus` case. - match status { - TimeSyncStatus::NotYetChecked - | TimeSyncStatus::ConfiguredToSkip - | TimeSyncStatus::FailedToGetSyncStatus(_) => { - Err(Error::TimeNotSynchronized) - } - TimeSyncStatus::TimeSync(time_sync) => Ok(time_sync), - } - } - pub async fn ensure_scrimlet_host_ports( &self, uplinks: Vec, diff --git a/sled-agent/types/src/lib.rs b/sled-agent/types/src/lib.rs index 172bbed8a3..b6cf51b40d 100644 --- a/sled-agent/types/src/lib.rs +++ b/sled-agent/types/src/lib.rs @@ -14,6 +14,5 @@ pub mod rack_init; pub mod rack_ops; pub mod sled; pub mod support_bundle; -pub mod time_sync; pub mod zone_bundle; pub mod zone_images; diff --git a/sled-agent/types/src/time_sync.rs b/sled-agent/types/src/time_sync.rs deleted file mode 100644 index 1a26d63e6c..0000000000 --- a/sled-agent/types/src/time_sync.rs +++ /dev/null @@ -1,31 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use std::net::IpAddr; - -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; - -// TODO: Remove me, in favor of ntp-admin/types/src/lib.rs -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] -pub struct TimeSync { - /// The synchronization state of the sled, true when the system clock - /// and the NTP clock are in sync (to within a small window). - pub sync: bool, - /// The NTP reference ID. - pub ref_id: u32, - /// The NTP reference IP address. - pub ip_addr: IpAddr, - /// The NTP stratum (our upstream's stratum plus one). - pub stratum: u8, - /// The NTP reference time (i.e. what chrony thinks the current time is, not - /// necessarily the current system time). - pub ref_time: f64, - // This could be f32, but there is a problem with progenitor/typify - // where, although the f32 correctly becomes "float" (and not "double") in - // the API spec, that "float" gets converted back to f64 when generating - // the client. - /// The current offset between the NTP clock and system clock. - pub correction: f64, -}