diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout
index 92718db5696..6b349bc4dae 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout
@@ -36,6 +36,7 @@ generated inventory collection eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51 from configu
 > # Try to plan a new blueprint; this should be okay even though the sled
 > # we added has no disks.
 > blueprint-plan dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51
+INFO skipping noop image source check for all sleds (no current TUF repo)
 INFO skipping sled (no zpools in service), sled_id: 00320471-945d-413c-85e7-03e091a70b3c
 INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
 INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
index 50c14f73d0a..486e8dcad0c 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
@@ -493,6 +493,7 @@ T ENA ID                                   PARENT
 * yes ade5749d-bdf3-4fab-a8ae-00bea01b3a5a 02697f74-b14a-4418-90f0-c28b2a3a6aa9
 
 > blueprint-plan ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
+INFO skipping noop image source check for all sleds (no current TUF repo)
 INFO found sled missing NTP zone (will add one), sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b
 INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
 WARN failed to place all new desired Clickhouse zones, placed: 0, wanted_to_place: 1
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
index 96d99da31ee..8732ed80b6e 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
@@ -969,6 +969,7 @@ parent: 3f00b694-1b16-4aaa-8f78-e6b3a527b434
 
 > # blueprint-plan will place a new external DNS zone, diff DNS to see the new zone has `ns` and NS records.
 > blueprint-plan 366b0b68-d80e-4bc1-abd3-dc69837847e0
+INFO skipping noop image source check for all sleds (no current TUF repo)
 INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
 INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1
 INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout
index a2125c4f88e..132fdeb474f 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout
@@ -1002,6 +1002,7 @@ external DNS:
 
 > # Planning a new blueprint will now replace the expunged zone, with new records for its replacement.
 > blueprint-plan 58d5e830-0884-47d8-a7cd-b2b3751adeb4
+INFO skipping noop image source check for all sleds (no current TUF repo)
 INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
 INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1
 INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0
diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
index 34816548d49..de0812ab390 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
@@ -303,6 +303,10 @@ pub(crate) enum Operation {
         num_datasets_expunged: usize,
         num_zones_expunged: usize,
     },
+    SledNoopZoneImageSourcesUpdated {
+        sled_id: SledUuid,
+        count: usize,
+    },
 }
 
 impl fmt::Display for Operation {
@@ -367,6 +371,13 @@ impl fmt::Display for Operation {
                     {num_zones_expunged} zones)"
                 )
             }
+            Self::SledNoopZoneImageSourcesUpdated { sled_id, count } => {
+                write!(
+                    f,
+                    "sled {sled_id}: performed {count} noop \
+                     zone image source updates"
+                )
+            }
         }
     }
 }
@@ -1133,6 +1144,20 @@ impl<'a> BlueprintBuilder<'a> {
         Ok(datasets.into())
     }
 
+    /// Returns the remove_mupdate_override field for a sled.
+    pub(crate) fn sled_get_remove_mupdate_override(
+        &self,
+        sled_id: SledUuid,
+    ) -> Result<Option<MupdateOverrideUuid>, Error> {
+        let editor = self.sled_editors.get(&sled_id).ok_or_else(|| {
+            Error::Planner(anyhow!(
+                "tried to get remove_mupdate_override for \
+                 unknown sled {sled_id}"
+            ))
+        })?;
+        Ok(editor.get_remove_mupdate_override())
+    }
+
     fn next_internal_dns_gz_address_index(&self, sled_id: SledUuid) -> u32 {
         let used_internal_dns_gz_address_indices = self
             .current_sled_zones(
diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs
index edb84d97d42..202584c93e0 100644
--- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs
@@ -286,6 +286,18 @@ impl SledEditor {
         }
     }
 
+    /// Returns the remove_mupdate_override field for this sled.
+    pub fn get_remove_mupdate_override(&self) -> Option<MupdateOverrideUuid> {
+        match &self.0 {
+            InnerSledEditor::Active(editor) => {
+                *editor.remove_mupdate_override.value()
+            }
+            InnerSledEditor::Decommissioned(sled) => {
+                sled.config.remove_mupdate_override
+            }
+        }
+    }
+
     fn as_active_mut(
         &mut self,
     ) -> Result<&mut ActiveSledEditor, SledEditError> {
@@ -343,8 +355,6 @@ impl SledEditor {
     }
 
     /// Sets the image source for a zone.
-    ///
-    /// Currently only used by test code.
     pub fn set_zone_image_source(
         &mut self,
         zone_id: &OmicronZoneUuid,
diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/scalar.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/scalar.rs
index 74eea138f86..ceafd15606d 100644
--- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/scalar.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/scalar.rs
@@ -16,7 +16,6 @@ impl<T> ScalarEditor<T> {
         ScalarEditor { original, value: EditValue::Original }
     }
 
-    #[expect(dead_code)]
     pub(crate) fn value(&self) -> &T {
         match &self.value {
             EditValue::Original => &self.original,
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs
index 215cdbc488e..c1d4de11745 100644
--- a/nexus/reconfigurator/planning/src/planner.rs
+++ b/nexus/reconfigurator/planning/src/planner.rs
@@ -22,6 +22,7 @@ use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::BlueprintPhysicalDiskDisposition;
 use nexus_types::deployment::BlueprintZoneConfig;
 use nexus_types::deployment::BlueprintZoneDisposition;
+use nexus_types::deployment::BlueprintZoneImageSource;
 use nexus_types::deployment::CockroachDbClusterVersion;
 use nexus_types::deployment::CockroachDbPreserveDowngrade;
 use nexus_types::deployment::CockroachDbSettings;
@@ -37,10 +38,12 @@ use nexus_types::inventory::Collection;
 use omicron_common::policy::INTERNAL_DNS_REDUNDANCY;
 use omicron_uuid_kinds::PhysicalDiskUuid;
 use omicron_uuid_kinds::SledUuid;
+use slog::debug;
 use slog::error;
 use slog::{Logger, info, warn};
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
+use std::collections::HashMap;
 use std::str::FromStr;
 
 pub(crate) use self::omicron_zone_placement::DiscretionaryOmicronZone;
@@ -147,6 +150,7 @@ impl<'a> Planner<'a> {
     fn do_plan(&mut self) -> Result<(), Error> {
         self.do_plan_expunge()?;
         self.do_plan_decommission()?;
+        self.do_plan_noop_image_source()?;
         self.do_plan_add()?;
         if let UpdateStepResult::ContinueToNextStep =
             self.do_plan_mgs_updates()
@@ -493,6 +497,165 @@ impl<'a> Planner<'a> {
         Ok(())
     }
 
+    fn do_plan_noop_image_source(&mut self) -> Result<(), Error> {
+        let Some(current_artifacts) = self.input.tuf_repo().description()
+        else {
+            info!(
+                self.log,
+                "skipping noop image source check for all sleds \
+                 (no current TUF repo)",
+            );
+            return Ok(());
+        };
+        let artifacts_by_hash: HashMap<_, _> = current_artifacts
+            .artifacts
+            .iter()
+            .map(|artifact| (artifact.hash, artifact))
+            .collect();
+
+        for sled_id in self.input.all_sled_ids(SledFilter::InService) {
+            let Some(inv_sled) = self.inventory.sled_agents.get(&sled_id)
+            else {
+                info!(
+                    self.log,
+                    "skipping noop image source check \
+                     (sled not present in latest inventory collection)";
+                    "sled_id" => %sled_id,
+                );
+                continue;
+            };
+
+            let zone_manifest = match &inv_sled
+                .zone_image_resolver
+                .zone_manifest
+                .boot_inventory
+            {
+                Ok(zm) => zm,
+                Err(message) => {
+                    // This is a string so we don't use InlineErrorChain::new.
+                    let message: &str = message;
+                    warn!(
+                        self.log,
+                        "skipping noop image source check since \
+                         sled-agent encountered error retrieving zone manifest \
+                         (this is abnormal)";
+                        "sled_id" => %sled_id,
+                        "error" => %message,
+                    );
+                    continue;
+                }
+            };
+
+            // Does the blueprint have the remove_mupdate_override field set for
+            // this sled? If it does, we don't want to touch the zones on this
+            // sled (they should all be InstallDataset until the
+            // remove_mupdate_override field is cleared).
+            if let Some(id) =
+                self.blueprint.sled_get_remove_mupdate_override(sled_id)?
+            {
+                debug!(
+                    self.log,
+                    "skipping noop image source check on sled \
+                     (blueprint has remove_mupdate_override set for sled)";
+                    "sled_id" => %sled_id,
+                    "bp_remove_mupdate_override_id" => %id,
+                );
+                continue;
+            }
+
+            // Which zones have image sources set to InstallDataset?
+            let install_dataset_zones = self
+                .blueprint
+                .current_sled_zones(
+                    sled_id,
+                    BlueprintZoneDisposition::is_in_service,
+                )
+                .filter(|z| {
+                    z.image_source == BlueprintZoneImageSource::InstallDataset
+                });
+
+            // Out of these, which zones' hashes (as reported in the zone
+            // manifest) match the corresponding ones in the TUF repo?
+            let matching_zones: Vec<_> = install_dataset_zones
+                .filter_map(|z| {
+                    let file_name =
+                        format!("{}.tar.gz", z.kind().artifact_name());
+                    let Some(artifact) =
+                        zone_manifest.artifacts.get(file_name.as_str())
+                    else {
+                        // The blueprint indicates that a zone should be present
+                        // that isn't in the install dataset. This might be an old
+                        // install dataset with a zone kind known to this version of
+                        // Nexus that isn't present in it. Not normally a cause for
+                        // concern.
+                        debug!(
+                            self.log,
+                            "blueprint zone not found in zone manifest, \
+                             ignoring for noop checks";
+                            "sled_id" => %sled_id,
+                            "zone_id" => %z.id,
+                            "kind" => z.kind().report_str(),
+                            "file_name" => file_name,
+                        );
+                        return None;
+                    };
+                    if let Err(message) = &artifact.status {
+                        // The artifact is somehow invalid or corrupt -- definitely
+                        // something to warn about, and we don't use it.
+                        warn!(
+                            self.log,
+                            "zone manifest inventory indicated install dataset \
+                             artifact is invalid, not using artifact (this is \
+                             abnormal)";
+                            "sled_id" => %sled_id,
+                            "zone_id" => %z.id,
+                            "kind" => z.kind().report_str(),
+                            "file_name" => file_name,
+                            "error" => %message,
+                        );
+                        return None;
+                    }
+
+                    // Does the hash match what's in the TUF repo?
+                    let Some(tuf_artifact) =
+                        artifacts_by_hash.get(&artifact.expected_hash)
+                    else {
+                        return None;
+                    };
+
+                    info!(
+                        self.log,
+                        "install dataset artifact hash matches TUF repo, \
+                         switching out the zone image source to Artifact";
+                        "sled_id" => %sled_id,
+                        "tuf_artifact_id" => %tuf_artifact.id,
+                    );
+                    Some((z.id, tuf_artifact))
+                })
+                .collect();
+
+            // Set all these zones' image sources in the blueprint to the
+            // corresponding artifact.
+            for (zone_id, tuf_artifact) in &matching_zones {
+                self.blueprint.sled_set_zone_source(
+                    sled_id,
+                    *zone_id,
+                    BlueprintZoneImageSource::from_available_artifact(
+                        tuf_artifact,
+                    ),
+                )?;
+                self.blueprint.record_operation(
+                    Operation::SledNoopZoneImageSourcesUpdated {
+                        sled_id,
+                        count: matching_zones.len(),
+                    },
+                );
+            }
+        }
+
+        Ok(())
+    }
+
     fn do_plan_add(&mut self) -> Result<(), Error> {
         // Internal DNS is a prerequisite for bringing up all other zones. At
         // this point, we assume that internal DNS (as a service) is already
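
Note: the heart of do_plan_noop_image_source above is a hash comparison. A zone still running out of the install dataset is switched to an explicit Artifact image source only when the hash the sled's zone manifest reports for that zone's tarball matches an artifact in the current TUF repo; a successful match is applied via sled_set_zone_source and recorded as a SledNoopZoneImageSourcesUpdated operation. The standalone Rust sketch below restates just that matching rule. It is illustrative only and not omicron code: Zone, ImageSource, and noop_image_source_updates are simplified stand-ins invented here, and the real planner additionally skips sleds missing from inventory, sleds with remove_mupdate_override set, and artifacts the manifest marks invalid.

// Illustrative sketch only -- simplified stand-ins for the real
// blueprint/inventory types consulted by do_plan_noop_image_source.
use std::collections::HashMap;

type ArtifactHash = [u8; 32];

#[derive(Debug, PartialEq)]
enum ImageSource {
    InstallDataset,
    Artifact { hash: ArtifactHash },
}

struct Zone {
    artifact_file_name: String, // e.g. "nexus.tar.gz"
    image_source: ImageSource,
}

/// For each zone still running from the install dataset, switch it to an
/// explicit Artifact source when the hash reported by the sled's zone
/// manifest matches an artifact in the current TUF repo. Returns the number
/// of zones updated.
fn noop_image_source_updates(
    zones: &mut [Zone],
    zone_manifest: &HashMap<String, ArtifactHash>, // file name -> hash on disk
    tuf_artifacts_by_hash: &HashMap<ArtifactHash, String>, // hash -> artifact id
) -> usize {
    let mut updated = 0;
    for zone in zones.iter_mut() {
        if zone.image_source != ImageSource::InstallDataset {
            continue;
        }
        // Zone not present in the install dataset manifest: leave it alone.
        let Some(hash) = zone_manifest.get(&zone.artifact_file_name) else {
            continue;
        };
        // Only a byte-for-byte hash match against the TUF repo justifies the
        // "noop" switch; otherwise the zone stays on the install dataset.
        if tuf_artifacts_by_hash.contains_key(hash) {
            zone.image_source = ImageSource::Artifact { hash: *hash };
            updated += 1;
        }
    }
    updated
}

fn main() {
    let nexus_hash: ArtifactHash = [0xab; 32];
    let mut zones = vec![Zone {
        artifact_file_name: "nexus.tar.gz".to_string(),
        image_source: ImageSource::InstallDataset,
    }];
    let manifest = HashMap::from([("nexus.tar.gz".to_string(), nexus_hash)]);
    let tuf = HashMap::from([(nexus_hash, "nexus-v1.2.3".to_string())]);
    let n = noop_image_source_updates(&mut zones, &manifest, &tuf);
    println!("updated {n} zone(s): {:?}", zones[0].image_source);
}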