Skip to content

sled-agent: validate host phase 2 artifacts when ledgering configs (PR 2/4) #8539

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions nexus-sled-agent-shared/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,16 @@ pub enum HostPhase2DesiredContents {
Artifact { hash: ArtifactHash },
}

impl HostPhase2DesiredContents {
/// The artifact hash described by `self`, if it has one.
pub fn artifact_hash(&self) -> Option<ArtifactHash> {
match self {
Self::CurrentContents => None,
Self::Artifact { hash } => Some(*hash),
}
}
}

/// Describes the desired contents for both host phase 2 slots.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
Expand Down
166 changes: 116 additions & 50 deletions sled-agent/config-reconciler/src/ledger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use slog::error;
use slog::info;
use slog::warn;
use slog_error_chain::InlineErrorChain;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::convert::Infallible;
use std::ops::Deref as _;
use tokio::sync::mpsc;
Expand Down Expand Up @@ -98,7 +98,7 @@ pub enum LedgerArtifactConfigError {
"Artifacts in use by ledgered sled config are not present \
in new artifact config: {0:?}"
)]
InUseArtifactedMissing(BTreeMap<ArtifactHash, &'static str>),
InUseArtifactedMissing(BTreeSet<ArtifactHash>),
}

#[derive(Debug, Clone, PartialEq, Eq)]
Expand Down Expand Up @@ -442,10 +442,7 @@ impl<T: SledAgentArtifactStore> LedgerTask<T> {
// thing we confirm is that any referenced artifacts are present in the
// artifact store.
let mut artifact_validation_errors = Vec::new();
for zone in &new_config.zones {
let Some(artifact_hash) = zone.image_source.artifact_hash() else {
continue;
};
for artifact_hash in config_artifact_hashes(new_config) {
match self
.artifact_store
.validate_artifact_exists_in_storage(artifact_hash)
Expand Down Expand Up @@ -491,12 +488,10 @@ impl<T: SledAgentArtifactStore> LedgerTask<T> {
CurrentSledConfig::Ledgered(sled_config) => sled_config,
};

let mut missing = BTreeMap::new();
for zone in &sled_config.zones {
if let Some(hash) = zone.image_source.artifact_hash() {
if !new_config.artifacts.contains(&hash) {
missing.insert(hash, "current zone configuration");
}
let mut missing = BTreeSet::new();
for artifact_hash in config_artifact_hashes(sled_config) {
if !new_config.artifacts.contains(&artifact_hash) {
missing.insert(artifact_hash);
}
}

Expand Down Expand Up @@ -589,6 +584,18 @@ impl<T: SledAgentArtifactStore> LedgerTask<T> {
}
}

// Helper to return all the `ArtifactHash`es contained in a config.
fn config_artifact_hashes(
config: &OmicronSledConfig,
) -> impl Iterator<Item = ArtifactHash> + '_ {
config
.zones
.iter()
.filter_map(|zone| zone.image_source.artifact_hash())
.chain(config.host_phase_2.slot_a.artifact_hash())
.chain(config.host_phase_2.slot_b.artifact_hash())
}

async fn load_sled_config(
config_datasets: &[Utf8PathBuf],
log: &Logger,
Expand Down Expand Up @@ -648,6 +655,7 @@ mod tests {
use camino_tempfile::Utf8TempDir;
use id_map::IdMap;
use illumos_utils::zpool::ZpoolName;
use nexus_sled_agent_shared::inventory::HostPhase2DesiredContents;
use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots;
use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
use nexus_sled_agent_shared::inventory::OmicronZoneImageSource;
Expand Down Expand Up @@ -1054,7 +1062,7 @@ mod tests {

// Create a config that references a zone with a different artifact
// hash.
let config = OmicronSledConfig {
let mut config = OmicronSledConfig {
generation: Generation::new().next(),
disks: IdMap::new(),
datasets: IdMap::new(),
Expand All @@ -1070,7 +1078,7 @@ mod tests {
// The ledger task should reject this config due to a missing artifact.
let err = test_harness
.task_handle
.set_new_config(config)
.set_new_config(config.clone())
.await
.expect("can communicate with task")
.expect_err("config should fail");
Expand All @@ -1081,20 +1089,48 @@ mod tests {

// Change the config to reference the artifact that does exist; this one
// should be accepted.
let config = OmicronSledConfig {
generation: Generation::new().next(),
disks: IdMap::new(),
datasets: IdMap::new(),
zones: [make_dummy_zone_config_using_artifact_hash(
existing_artifact_hash,
)]
.into_iter()
.collect(),
remove_mupdate_override: None,
host_phase_2: HostPhase2DesiredSlots::current_contents(),
config.zones = [make_dummy_zone_config_using_artifact_hash(
existing_artifact_hash,
)]
.into_iter()
.collect();

test_harness
.task_handle
.set_new_config(config.clone())
.await
.expect("can communicate with task")
.expect("config should be ledgered");

// Try a config that references a host phase 2 artifact that isn't in
// the store; this should be rejected.
config.generation = config.generation.next();
config.zones = IdMap::new();
config.host_phase_2 = HostPhase2DesiredSlots {
slot_a: HostPhase2DesiredContents::CurrentContents,
slot_b: HostPhase2DesiredContents::Artifact {
hash: nonexisting_artifact_hash,
},
};
let err = test_harness
.task_handle
.set_new_config(config.clone())
.await
.expect("can communicate with task")
.expect_err("config should fail");
match err {
LedgerNewConfigError::ArtifactStoreValidationFailed(_) => (),
_ => panic!("unexpected error {}", InlineErrorChain::new(&err)),
}

// TODO-john also test host phase 2 artifacts!
// Change the config to reference the artifact that does exist; this one
// should be accepted.
config.host_phase_2 = HostPhase2DesiredSlots {
slot_a: HostPhase2DesiredContents::CurrentContents,
slot_b: HostPhase2DesiredContents::Artifact {
hash: existing_artifact_hash,
},
};

test_harness
.task_handle
Expand All @@ -1112,54 +1148,84 @@ mod tests {
"reject_artifact_configs_removing_referenced_artifacts",
);

// Claim we have a zone using the all-zeroes hash.
let used_artifact_hash = ArtifactHash([0; 32]);
let unused_artifact_hash = ArtifactHash([1; 32]);
// Construct some artifacts hashes we'll claim to be using and one we
// won't.
let used_zone_artifact_hash = ArtifactHash([0; 32]);
let used_host_artifact_hash = ArtifactHash([1; 32]);
let unused_artifact_hash = ArtifactHash([2; 32]);

// Claim we have a host phase 2
// Set up the ledger task with an initial config.
let mut sled_config = make_nonempty_sled_config();
sled_config.zones.insert(make_dummy_zone_config_using_artifact_hash(
used_artifact_hash,
used_zone_artifact_hash,
));
sled_config.host_phase_2.slot_a = HostPhase2DesiredContents::Artifact {
hash: used_host_artifact_hash,
};

let test_harness =
TestHarness::with_initial_config(logctx.log.clone(), &sled_config)
.await;

// Construct an `ArtifactConfig` that doesn't contain
// `used_artifact_hash`; this should be rejected.
let mut artifact_config = ArtifactConfig {
generation: Generation::new(),
artifacts: [unused_artifact_hash].into_iter().collect(),
artifacts: BTreeSet::new(),
};
let err = test_harness
.task_handle
.validate_artifact_config(artifact_config.clone())
.await
.expect("no ledger task error")
.expect_err("config should be rejected");

match err {
LedgerArtifactConfigError::InUseArtifactedMissing(artifacts) => {
assert!(
artifacts.contains_key(&used_artifact_hash),
"unexpected error contents: {artifacts:?}"
);
// Try some ArtifactConfigs that don't contain both `used_*` artifact
// hashes; all of these should be rejected.
for incomplete_artifacts in [
&[used_zone_artifact_hash] as &[ArtifactHash],
&[used_host_artifact_hash],
&[unused_artifact_hash],
&[used_zone_artifact_hash, unused_artifact_hash],
&[used_host_artifact_hash, unused_artifact_hash],
] {
artifact_config.artifacts =
incomplete_artifacts.iter().cloned().collect();

let err = test_harness
.task_handle
.validate_artifact_config(artifact_config.clone())
.await
.expect("no ledger task error")
.expect_err("config should be rejected");

match err {
LedgerArtifactConfigError::InUseArtifactedMissing(
artifacts,
) => {
if !incomplete_artifacts.contains(&used_zone_artifact_hash)
{
assert!(
artifacts.contains(&used_zone_artifact_hash),
"unexpected error contents: {artifacts:?}"
);
}
if !incomplete_artifacts.contains(&used_host_artifact_hash)
{
assert!(
artifacts.contains(&used_host_artifact_hash),
"unexpected error contents: {artifacts:?}"
);
}
}
_ => panic!("unexpected error {}", InlineErrorChain::new(&err)),
}
_ => panic!("unexpected error {}", InlineErrorChain::new(&err)),
}

// Put the used artifact hash into the artifact config; now it should
// Put both used artifact hashes into the artifact config; now it should
// pass validation.
artifact_config.artifacts.insert(used_artifact_hash);
artifact_config.artifacts.insert(used_zone_artifact_hash);
artifact_config.artifacts.insert(used_host_artifact_hash);
test_harness
.task_handle
.validate_artifact_config(artifact_config)
.await
.expect("no ledger task error")
.expect("config is valid");

// TODO-john also test host phase 2 artifacts!

logctx.cleanup_successful();
}

Expand Down
Loading