Skip to content

[wip] [22/n] [reconfigurator-planning] support no-op image source updates #8467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ generated inventory collection eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51 from configu
> # Try to plan a new blueprint; this should be okay even though the sled
> # we added has no disks.
> blueprint-plan dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51
INFO skipping noop image source check for all sleds (no current TUF repo)
INFO skipping sled (no zpools in service), sled_id: 00320471-945d-413c-85e7-03e091a70b3c
INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,7 @@ T ENA ID PARENT
* yes ade5749d-bdf3-4fab-a8ae-00bea01b3a5a 02697f74-b14a-4418-90f0-c28b2a3a6aa9 <REDACTED_TIMESTAMP>

> blueprint-plan ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
INFO skipping noop image source check for all sleds (no current TUF repo)
INFO found sled missing NTP zone (will add one), sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b
INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
WARN failed to place all new desired Clickhouse zones, placed: 0, wanted_to_place: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -969,6 +969,7 @@ parent: 3f00b694-1b16-4aaa-8f78-e6b3a527b434

> # blueprint-plan will place a new external DNS zone, diff DNS to see the new zone has `ns<N>` and NS records.
> blueprint-plan 366b0b68-d80e-4bc1-abd3-dc69837847e0
INFO skipping noop image source check for all sleds (no current TUF repo)
INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1
INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1002,6 +1002,7 @@ external DNS:

> # Planning a new blueprint will now replace the expunged zone, with new records for its replacement.
> blueprint-plan 58d5e830-0884-47d8-a7cd-b2b3751adeb4
INFO skipping noop image source check for all sleds (no current TUF repo)
INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1
INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0
Expand Down
25 changes: 25 additions & 0 deletions nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,10 @@ pub(crate) enum Operation {
num_datasets_expunged: usize,
num_zones_expunged: usize,
},
SledNoopZoneImageSourcesUpdated {
sled_id: SledUuid,
count: usize,
},
}

impl fmt::Display for Operation {
Expand Down Expand Up @@ -367,6 +371,13 @@ impl fmt::Display for Operation {
{num_zones_expunged} zones)"
)
}
Self::SledNoopZoneImageSourcesUpdated { sled_id, count } => {
write!(
f,
"sled {sled_id}: performed {count} noop \
zone image source updates"
)
}
}
}
}
Expand Down Expand Up @@ -1133,6 +1144,20 @@ impl<'a> BlueprintBuilder<'a> {
Ok(datasets.into())
}

/// Returns the remove_mupdate_override field for a sled.
pub(crate) fn sled_get_remove_mupdate_override(
&self,
sled_id: SledUuid,
) -> Result<Option<MupdateOverrideUuid>, Error> {
let editor = self.sled_editors.get(&sled_id).ok_or_else(|| {
Error::Planner(anyhow!(
"tried to get remove_mupdate_override for \
unknown sled {sled_id}"
))
})?;
Ok(editor.get_remove_mupdate_override())
}

fn next_internal_dns_gz_address_index(&self, sled_id: SledUuid) -> u32 {
let used_internal_dns_gz_address_indices = self
.current_sled_zones(
Expand Down
14 changes: 12 additions & 2 deletions nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,18 @@ impl SledEditor {
}
}

/// Returns the remove_mupdate_override field for this sled.
pub fn get_remove_mupdate_override(&self) -> Option<MupdateOverrideUuid> {
match &self.0 {
InnerSledEditor::Active(editor) => {
*editor.remove_mupdate_override.value()
}
InnerSledEditor::Decommissioned(sled) => {
sled.config.remove_mupdate_override
}
}
}

fn as_active_mut(
&mut self,
) -> Result<&mut ActiveSledEditor, SledEditError> {
Expand Down Expand Up @@ -343,8 +355,6 @@ impl SledEditor {
}

/// Sets the image source for a zone.
///
/// Currently only used by test code.
pub fn set_zone_image_source(
&mut self,
zone_id: &OmicronZoneUuid,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ impl<T> ScalarEditor<T> {
ScalarEditor { original, value: EditValue::Original }
}

#[expect(dead_code)]
pub(crate) fn value(&self) -> &T {
match &self.value {
EditValue::Original => &self.original,
Expand Down
163 changes: 163 additions & 0 deletions nexus/reconfigurator/planning/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use nexus_types::deployment::Blueprint;
use nexus_types::deployment::BlueprintPhysicalDiskDisposition;
use nexus_types::deployment::BlueprintZoneConfig;
use nexus_types::deployment::BlueprintZoneDisposition;
use nexus_types::deployment::BlueprintZoneImageSource;
use nexus_types::deployment::CockroachDbClusterVersion;
use nexus_types::deployment::CockroachDbPreserveDowngrade;
use nexus_types::deployment::CockroachDbSettings;
Expand All @@ -37,10 +38,12 @@ use nexus_types::inventory::Collection;
use omicron_common::policy::INTERNAL_DNS_REDUNDANCY;
use omicron_uuid_kinds::PhysicalDiskUuid;
use omicron_uuid_kinds::SledUuid;
use slog::debug;
use slog::error;
use slog::{Logger, info, warn};
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::str::FromStr;

pub(crate) use self::omicron_zone_placement::DiscretionaryOmicronZone;
Expand Down Expand Up @@ -147,6 +150,7 @@ impl<'a> Planner<'a> {
fn do_plan(&mut self) -> Result<(), Error> {
self.do_plan_expunge()?;
self.do_plan_decommission()?;
self.do_plan_noop_image_source()?;
self.do_plan_add()?;
if let UpdateStepResult::ContinueToNextStep = self.do_plan_mgs_updates()
{
Expand Down Expand Up @@ -493,6 +497,165 @@ impl<'a> Planner<'a> {
Ok(())
}

fn do_plan_noop_image_source(&mut self) -> Result<(), Error> {
let Some(current_artifacts) = self.input.tuf_repo().description()
else {
info!(
self.log,
"skipping noop image source check for all sleds \
(no current TUF repo)",
);
return Ok(());
};
let artifacts_by_hash: HashMap<_, _> = current_artifacts
.artifacts
.iter()
.map(|artifact| (artifact.hash, artifact))
.collect();

for sled_id in self.input.all_sled_ids(SledFilter::InService) {
let Some(inv_sled) = self.inventory.sled_agents.get(&sled_id)
else {
info!(
self.log,
"skipping noop image source check \
(sled not present in latest inventory collection)";
"sled_id" => %sled_id,
);
continue;
};

let zone_manifest = match &inv_sled
.zone_image_resolver
.zone_manifest
.boot_inventory
{
Ok(zm) => zm,
Err(message) => {
// This is a string so we don't use InlineErrorChain::new.
let message: &str = message;
warn!(
self.log,
"skipping noop image source check since \
sled-agent encountered error retrieving zone manifest \
(this is abnormal)";
"sled_id" => %sled_id,
"error" => %message,
);
continue;
}
};

// Does the blueprint have the remove_mupdate_override field set for
// this sled? If it does, we don't want to touch the zones on this
// sled (they should all be InstallDataset until the
// remove_mupdate_override field is cleared).
if let Some(id) =
self.blueprint.sled_get_remove_mupdate_override(sled_id)?
{
debug!(
self.log,
"skipping noop image source check on sled \
(blueprint has get_remove_mupdate_override set for sled)";
"sled_id" => %sled_id,
"bp_remove_mupdate_override_id" => %id,
);
continue;
}

// Which zones have image sources set to InstallDataset?
let install_dataset_zones = self
.blueprint
.current_sled_zones(
sled_id,
BlueprintZoneDisposition::is_in_service,
)
.filter(|z| {
z.image_source == BlueprintZoneImageSource::InstallDataset
});

// Out of these, which zones' hashes (as reported in the zone
// manifest) match the corresponding ones in the TUF repo?
let matching_zones: Vec<_> = install_dataset_zones
.filter_map(|z| {
let file_name =
format!("{}.tar.gz", z.kind().artifact_name());
let Some(artifact) =
zone_manifest.artifacts.get(file_name.as_str())
else {
// The blueprint indicates that a zone should be present
// that isn't in the install dataset. This might be an old
// install dataset with a zone kind known to this version of
// Nexus that isn't present in it. Not normally a cause for
// concern.
debug!(
self.log,
"blueprint zone not found in zone manifest, \
ignoring for noop checks";
"sled_id" => %sled_id,
"zone_id" => %z.id,
"kind" => z.kind().report_str(),
"file_name" => file_name,
);
return None;
};
if let Err(message) = &artifact.status {
// The artifact is somehow invalid and corrupt -- definitely
// something to warn about and not proceed.
warn!(
self.log,
"zone manifest inventory indicated install dataset \
artifact is invalid, not using artifact (this is \
abnormal)";
"sled_id" => %sled_id,
"zone_id" => %z.id,
"kind" => z.kind().report_str(),
"file_name" => file_name,
"error" => %message,
);
return None;
}

// Does the hash match what's in the TUF repo?
let Some(tuf_artifact) =
artifacts_by_hash.get(&artifact.expected_hash)
else {
return None;
};

info!(
self.log,
"install dataset artifact hash matches TUF repo, \
switching out the zone image source to Artifact";
"sled_id" => %sled_id,
"tuf_artifact_id" => %tuf_artifact.id,
);
Some((z.id, tuf_artifact))
})
.collect();

// Set all these zones' image sources to the corresponding
// blueprint.
for (zone_id, tuf_artifact) in &matching_zones {
self.blueprint.sled_set_zone_source(
sled_id,
*zone_id,
BlueprintZoneImageSource::from_available_artifact(
tuf_artifact,
),
)?;
self.blueprint.record_operation(
Operation::SledNoopZoneImageSourcesUpdated {
sled_id,
count: matching_zones.len(),
},
);
}
}

Ok(())
}

fn do_plan_add(&mut self) -> Result<(), Error> {
// Internal DNS is a prerequisite for bringing up all other zones. At
// this point, we assume that internal DNS (as a service) is already
Expand Down
Loading
You are viewing a condensed version of this merge commit. You can view the full changes here.