Skip to content

Use NTP admin API instead of sled agent, remove sled agent time_sync API #8597

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: ntp-admin
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions dev-tools/ls-apis/tests/api_dependencies.out
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ Nexus Internal API (client: nexus-client)
consumed by: propolis-server (propolis/bin/propolis-server) via 3 paths

NTP Admin (client: ntp-admin-client)
consumed by: omicron-sled-agent (omicron/sled-agent) via 2 paths

External API (client: oxide-client)

Expand Down
67 changes: 0 additions & 67 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -1464,29 +1464,6 @@
}
}
},
"/timesync": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if there's any documentation that relies on checking the response from this endpoint in the logs to debug an NTP sync issue or something similar. I have a vague memory of something like that. If anyone remembers, that documentation should be updated. Perhaps an announcement on Matrix might be good as well? I've certainly used it before if an omicron deployment was taking too long to start.

"get": {
"operationId": "timesync_get",
"responses": {
"200": {
"description": "successful operation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TimeSync"
}
}
}
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/v2p": {
"get": {
"summary": "List v2p mappings present on sled",
Expand Down Expand Up @@ -7326,50 +7303,6 @@
"uplinks"
]
},
"TimeSync": {
"type": "object",
"properties": {
"correction": {
"description": "The current offset between the NTP clock and system clock.",
"type": "number",
"format": "double"
},
"ip_addr": {
"description": "The NTP reference IP address.",
"type": "string",
"format": "ip"
},
"ref_id": {
"description": "The NTP reference ID.",
"type": "integer",
"format": "uint32",
"minimum": 0
},
"ref_time": {
"description": "The NTP reference time (i.e. what chrony thinks the current time is, not necessarily the current system time).",
"type": "number",
"format": "double"
},
"stratum": {
"description": "The NTP stratum (our upstream's stratum plus one).",
"type": "integer",
"format": "uint8",
"minimum": 0
},
"sync": {
"description": "The synchronization state of the sled, true when the system clock and the NTP clock are in sync (to within a small window).",
"type": "boolean"
}
},
"required": [
"correction",
"ip_addr",
"ref_id",
"ref_time",
"stratum",
"sync"
]
},
"TxEqConfig": {
"description": "Per-port tx-eq overrides. This can be used to fine-tune the transceiver equalization settings to improve signal integrity.",
"type": "object",
Expand Down
1 change: 1 addition & 0 deletions sled-agent/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ nexus-client.workspace = true
nexus-config.workspace = true
nexus-sled-agent-shared.workspace = true
nexus-types.workspace = true
ntp-admin-client.workspace = true
omicron-common.workspace = true
omicron-ddm-admin-client.workspace = true
omicron-uuid-kinds.workspace = true
Expand Down
9 changes: 0 additions & 9 deletions sled-agent/api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ use sled_agent_types::{
VmmPutStateResponse, VmmUnregisterResponse,
},
sled::AddSledRequest,
time_sync::TimeSync,
zone_bundle::{
BundleUtilization, CleanupContext, CleanupCount, PriorityOrder,
ZoneBundleId, ZoneBundleMetadata,
Expand Down Expand Up @@ -477,14 +476,6 @@ pub trait SledAgentApi {
rqctx: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Vec<VirtualNetworkInterfaceHost>>, HttpError>;

#[endpoint {
method = GET,
path = "/timesync",
}]
async fn timesync_get(
rqctx: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<TimeSync>, HttpError>;

#[endpoint {
method = POST,
path = "/switch-ports",
Expand Down
1 change: 1 addition & 0 deletions sled-agent/config-reconciler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ illumos-utils.workspace = true
installinator-common.workspace = true
key-manager.workspace = true
nexus-sled-agent-shared.workspace = true
ntp-admin-client.workspace = true
omicron-common.workspace = true
omicron-uuid-kinds.workspace = true
serde.workspace = true
Expand Down
2 changes: 1 addition & 1 deletion sled-agent/config-reconciler/src/reconciler_task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ impl ReconcilerTask {

// Collect the current timesync status (needed to start any new zones,
// and also we want to report it as part of each reconciler result).
let timesync_status = self.zones.check_timesync().await;
let timesync_status = self.zones.check_timesync(&self.log).await;

// Call back into sled-agent and let it do any work that needs to happen
// once time is sync'd (e.g., rewrite `uptime`).
Expand Down
96 changes: 29 additions & 67 deletions sled-agent/config-reconciler/src/reconciler_task/zones.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;
use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
use nexus_sled_agent_shared::inventory::OmicronZoneImageSource;
use nexus_sled_agent_shared::inventory::OmicronZoneType;
use ntp_admin_client::types::TimeSync;
use omicron_common::address::Ipv6Subnet;
use omicron_uuid_kinds::OmicronZoneUuid;
use sled_agent_types::time_sync::TimeSync;
use sled_agent_types::zone_bundle::ZoneBundleCause;
use sled_agent_types::zone_images::ResolverStatus;
use sled_storage::config::MountConfig;
Expand All @@ -36,10 +36,7 @@ use slog::warn;
use slog_error_chain::InlineErrorChain;
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::net::IpAddr;
use std::net::Ipv6Addr;
use std::num::NonZeroUsize;
use std::str::FromStr as _;
use std::sync::Arc;

use super::OmicronDatasets;
Expand Down Expand Up @@ -70,6 +67,8 @@ pub enum TimeSyncError {
NoRunningNtpZone,
#[error("multiple running NTP zones - this should never happen!")]
MultipleRunningNtpZones,
#[error("failed to communicate with NTP admin server")]
NtpAdmin(#[from] ntp_admin_client::Error<ntp_admin_client::types::Error>),
#[error("failed to execute chronyc within NTP zone")]
ExecuteChronyc(#[source] RunCommandError),
#[error(
Expand Down Expand Up @@ -465,10 +464,10 @@ impl OmicronZones {
}

/// Check the timesync status from a running NTP zone (if it exists)
pub(super) async fn check_timesync(&self) -> TimeSyncStatus {
pub(super) async fn check_timesync(&self, log: &Logger) -> TimeSyncStatus {
match &self.timesync_config {
TimeSyncConfig::Normal => {
match self.timesync_status_from_ntp_zone().await {
match self.timesync_status_from_ntp_zone(log).await {
Ok(timesync) => TimeSyncStatus::TimeSync(timesync),
Err(err) => {
TimeSyncStatus::FailedToGetSyncStatus(Arc::new(err))
Expand All @@ -481,83 +480,45 @@ impl OmicronZones {

async fn timesync_status_from_ntp_zone(
&self,
log: &Logger,
) -> Result<TimeSync, TimeSyncError> {
// Get the one and only running NTP zone, or return an error.
let mut running_ntp_zones = self.zones.iter().filter_map(|z| {
let mut ntp_admin_addresses = self.zones.iter().filter_map(|z| {
if !z.config.zone_type.is_ntp() {
return None;
}
// TODO(https://github.com/oxidecomputer/omicron/issues/6796):
//
// We could avoid hard-coding the port here if the zone was fully
// specified to include both NTP and Admin server addresses.
let mut addr = match z.config.zone_type {
OmicronZoneType::BoundaryNtp { address, .. } => address,
OmicronZoneType::InternalNtp { address, .. } => address,
_ => return None,
};
addr.set_port(omicron_common::address::NTP_ADMIN_PORT);

match &z.state {
ZoneState::Running(running_zone) => Some(running_zone),
ZoneState::Running(_) => Some(addr),
ZoneState::PartiallyShutDown { .. }
| ZoneState::FailedToStart(_) => None,
}
});
let running_ntp_zone =
running_ntp_zones.next().ok_or(TimeSyncError::NoRunningNtpZone)?;
if running_ntp_zones.next().is_some() {
let ntp_admin_address = ntp_admin_addresses
.next()
.ok_or(TimeSyncError::NoRunningNtpZone)?;
if ntp_admin_addresses.next().is_some() {
return Err(TimeSyncError::MultipleRunningNtpZones);
}

// XXXNTP - This could be replaced with a direct connection to the
// daemon using a patched version of the chrony_candm crate to allow
// a custom server socket path. From the GZ, it should be possible to
// connect to the UNIX socket at
// format!("{}/var/run/chrony/chronyd.sock", ntp_zone.root())

let stdout = running_ntp_zone
.run_cmd(&["/usr/bin/chronyc", "-c", "tracking"])
.map_err(TimeSyncError::ExecuteChronyc)?;

let v: Vec<&str> = stdout.split(',').collect();

if v.len() < 10 {
return Err(TimeSyncError::FailedToParse {
reason: "too few fields",
stdout,
});
}

let Ok(ref_id) = u32::from_str_radix(v[0], 16) else {
return Err(TimeSyncError::FailedToParse {
reason: "bad ref_id",
stdout,
});
};
let ip_addr =
IpAddr::from_str(v[1]).unwrap_or(Ipv6Addr::UNSPECIFIED.into());
let Ok(stratum) = u8::from_str(v[2]) else {
return Err(TimeSyncError::FailedToParse {
reason: "bad stratum",
stdout,
});
};
let Ok(ref_time) = f64::from_str(v[3]) else {
return Err(TimeSyncError::FailedToParse {
reason: "bad ref_time",
stdout,
});
};
let Ok(correction) = f64::from_str(v[4]) else {
return Err(TimeSyncError::FailedToParse {
reason: "bad correction",
stdout,
});
};

// Per `chronyc waitsync`'s implementation, if either the
// reference IP address is not unspecified or the reference
// ID is not 0 or 0x7f7f0101, we are synchronized to a peer.
let peer_sync =
!ip_addr.is_unspecified() || (ref_id != 0 && ref_id != 0x7f7f0101);
let client = ntp_admin_client::Client::new(
&format!("http://{ntp_admin_address}"),
log.clone(),
);

let sync = stratum < 10
&& ref_time > 1234567890.0
&& peer_sync
&& correction.abs() <= 0.05;
let timesync = client.timesync().await?.into_inner();

Ok(TimeSync { sync, ref_id, ip_addr, stratum, ref_time, correction })
Ok(timesync)
}
}

Expand Down Expand Up @@ -1111,6 +1072,7 @@ mod tests {
use sled_agent_types::zone_images::ZoneManifestStatus;
use std::collections::BTreeSet;
use std::collections::VecDeque;
use std::net::Ipv6Addr;
use std::sync::Mutex;
use tufaceous_artifact::ArtifactHash;

Expand Down
8 changes: 0 additions & 8 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ use sled_agent_types::instance::{
VmmPutStateResponse, VmmUnregisterResponse,
};
use sled_agent_types::sled::AddSledRequest;
use sled_agent_types::time_sync::TimeSync;
use sled_agent_types::zone_bundle::{
BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod,
StorageLimit, ZoneBundleId, ZoneBundleMetadata,
Expand Down Expand Up @@ -708,13 +707,6 @@ impl SledAgentApi for SledAgentImpl {
Ok(HttpResponseOk(vnics))
}

async fn timesync_get(
rqctx: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<TimeSync>, HttpError> {
let sa = rqctx.context();
Ok(HttpResponseOk(sa.timesync_get().await.map_err(|e| Error::from(e))?))
}

async fn uplink_ensure(
rqctx: RequestContext<Self::Context>,
body: TypedBody<SwitchPorts>,
Expand Down
Loading
Loading