diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index e91d4d08a65..02df12e7eca 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -45,6 +45,7 @@ use nexus_types::deployment::ClickhouseMode;
 use nexus_types::deployment::ClickhousePolicy;
 use nexus_types::deployment::OximeterReadMode;
 use nexus_types::deployment::OximeterReadPolicy;
+use nexus_types::deployment::WaitCondition;
 use nexus_types::internal_api::background::AbandonedVmmReaperStatus;
 use nexus_types::internal_api::background::BlueprintPlannerStatus;
 use nexus_types::internal_api::background::BlueprintRendezvousStatus;
@@ -1218,6 +1219,13 @@ fn print_task_abandoned_vmm_reaper(details: &serde_json::Value) {
 }

 fn print_task_blueprint_planner(details: &serde_json::Value) {
+    fn print_waiting_on(waiting_on: &[WaitCondition]) {
+        let n = waiting_on.len();
+        if n > 0 {
+            println!("    waiting on {n} events: {waiting_on:?}");
+        }
+    }
+
     let status =
         match serde_json::from_value::<BlueprintPlannerStatus>(details.clone())
         {
@@ -1237,20 +1245,32 @@ fn print_task_blueprint_planner(details: &serde_json::Value) {
         BlueprintPlannerStatus::Error(error) => {
             println!("    task did not complete successfully: {error}");
         }
-        BlueprintPlannerStatus::Unchanged { parent_blueprint_id } => {
+        BlueprintPlannerStatus::Unchanged {
+            parent_blueprint_id,
+            waiting_on,
+        } => {
             println!("    plan unchanged from parent {parent_blueprint_id}");
+            print_waiting_on(&waiting_on);
         }
-        BlueprintPlannerStatus::Planned { parent_blueprint_id, error } => {
+        BlueprintPlannerStatus::Planned {
+            parent_blueprint_id,
+            error,
+            waiting_on,
+        } => {
             println!(
                 "    planned new blueprint from parent {parent_blueprint_id}, \
                 but could not make it the target: {error}"
             );
+            print_waiting_on(&waiting_on);
         }
-        BlueprintPlannerStatus::Targeted { blueprint_id, .. } => {
+        BlueprintPlannerStatus::Targeted {
+            blueprint_id, waiting_on, ..
+        } => {
             println!(
                 "    planned new blueprint {blueprint_id}, \
                 and made it the current target"
             );
+            print_waiting_on(&waiting_on);
         }
     }
 }
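As context for the omdb change above: `print_waiting_on` stays silent for an empty list and Debug-formats the whole list otherwise. A minimal, self-contained sketch of that behavior (the `WaitCondition` stand-in below is invented; the real enum lives in `nexus_types::deployment`):

```rust
// Stand-in type for illustration only; the real WaitCondition is richer.
#[derive(Debug)]
enum WaitCondition {
    ZoneUpdate { zone: u32 },
}

fn print_waiting_on(waiting_on: &[WaitCondition]) {
    let n = waiting_on.len();
    if n > 0 {
        println!("    waiting on {n} events: {waiting_on:?}");
    }
}

fn main() {
    print_waiting_on(&[]); // prints nothing
    print_waiting_on(&[WaitCondition::ZoneUpdate { zone: 7 }]);
    // prints: "    waiting on 1 events: [ZoneUpdate { zone: 7 }]"
}
```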
diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs
index 75dfb0ddc31..226ba1dbae3 100644
--- a/dev-tools/reconfigurator-cli/src/lib.rs
+++ b/dev-tools/reconfigurator-cli/src/lib.rs
@@ -26,6 +26,7 @@ use nexus_reconfigurator_simulation::{BlueprintId, SimState};
 use nexus_types::deployment::OmicronZoneNic;
 use nexus_types::deployment::PlanningInput;
 use nexus_types::deployment::SledFilter;
+use nexus_types::deployment::ZoneExpungeReason;
 use nexus_types::deployment::execution;
 use nexus_types::deployment::execution::blueprint_external_dns_config;
 use nexus_types::deployment::execution::blueprint_internal_dns_config;
@@ -1249,7 +1250,11 @@ fn cmd_blueprint_edit(
         BlueprintEditCommands::ExpungeZone { zone_id } => {
             let sled_id = sled_with_zone(&builder, &zone_id)?;
             builder
-                .sled_expunge_zone(sled_id, zone_id)
+                .sled_expunge_zone(
+                    sled_id,
+                    zone_id,
+                    &ZoneExpungeReason::ManualEdit,
+                )
                 .context("failed to expunge zone")?;
             format!("expunged zone {zone_id} from sled {sled_id}")
         }
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
index 50c14f73d0a..f2a1b37e704 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
@@ -505,8 +505,7 @@ INFO sufficient ExternalDns zones exist in plan, desired_count: 0, current_count
 WARN failed to place all new desired Nexus zones, placed: 0, wanted_to_place: 3
 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
 WARN cannot issue more SP updates (no current artifacts)
-INFO some zones not yet up-to-date, sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b
-INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
+INFO some zones not yet up-to-date
 generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a

 > blueprint-list
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
index 96d99da31ee..d2fc24fc08f 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout
@@ -980,8 +980,7 @@ INFO added zone to sled, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a, kind: Ex
 INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
 WARN cannot issue more SP updates (no current artifacts)
-INFO some zones not yet up-to-date, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a
-INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
+INFO some zones not yet up-to-date
 generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0

 > blueprint-diff 366b0b68-d80e-4bc1-abd3-dc69837847e0 9c998c1d-1a7b-440a-ae0c-40f781dea6e2
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout
index a2125c4f88e..719d8568a1d 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout
@@ -1013,8 +1013,7 @@ INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count
 INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
 WARN cannot issue more SP updates (no current artifacts)
-INFO some zones not yet up-to-date, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c
-INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
+INFO some zones not yet up-to-date
 generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4

 > blueprint-diff 58d5e830-0884-47d8-a7cd-b2b3751adeb4 af934083-59b5-4bf6-8966-6fb5292c29e1
diff --git a/dev-tools/reconfigurator-cli/tests/output/target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/target-release-stdout
index 45e8e87bc67..044d2f52033 100644
--- a/dev-tools/reconfigurator-cli/tests/output/target-release-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/target-release-stdout
@@ -170,7 +170,6 @@ INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0
 INFO reached maximum number of pending SP updates, max: 1
-INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
 generated blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 based on parent blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21

 > blueprint-diff dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1
@@ -352,7 +351,6 @@ INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
 INFO SP update not yet completed (will keep it), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0
 INFO reached maximum number of pending SP updates, max: 1
-INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
 generated blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 based on parent blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1

 > blueprint-diff 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 58d5e830-0884-47d8-a7cd-b2b3751adeb4
@@ -537,7 +535,6 @@ INFO SP update completed (will remove it and re-evaluate board), artifact_versio
 INFO skipping board for SP update, serial_number: serial0, part_number: model0
 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1
 INFO reached maximum number of pending SP updates, max: 1
-INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
 generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4

 > blueprint-diff 58d5e830-0884-47d8-a7cd-b2b3751adeb4 af934083-59b5-4bf6-8966-6fb5292c29e1
@@ -729,7 +726,6 @@ INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
 INFO SP update impossible (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1
 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: Version(ArtifactVersion("0.5.0")), expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1
 INFO reached maximum number of pending SP updates, max: 1
-INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
 generated blueprint df06bb57-ad42-4431-9206-abff322896c7 based on parent blueprint af934083-59b5-4bf6-8966-6fb5292c29e1

 > blueprint-diff af934083-59b5-4bf6-8966-6fb5292c29e1 df06bb57-ad42-4431-9206-abff322896c7
@@ -922,7 +918,6 @@ INFO skipping board for SP update, serial_number: serial1, part_number: model1
 INFO skipping board for SP update, serial_number: serial0, part_number: model0
 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2
 INFO ran out of boards for SP update
-INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
 generated blueprint 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba based on parent blueprint df06bb57-ad42-4431-9206-abff322896c7

 > blueprint-diff df06bb57-ad42-4431-9206-abff322896c7 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba
diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs
index 8630d94659b..5960881266b 100644
--- a/live-tests/tests/test_nexus_add_remove.rs
+++ b/live-tests/tests/test_nexus_add_remove.rs
@@ -19,6 +19,7 @@ use nexus_reconfigurator_planning::planner::Planner;
 use nexus_reconfigurator_preparation::PlanningInputFromDb;
 use nexus_sled_agent_shared::inventory::ZoneKind;
 use nexus_types::deployment::SledFilter;
+use nexus_types::deployment::ZoneExpungeReason;
 use omicron_common::address::NEXUS_INTERNAL_PORT;
 use omicron_test_utils::dev::poll::CondCheckError;
 use omicron_test_utils::dev::poll::wait_for_condition;
@@ -123,7 +124,11 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) {
         &nexus,
         &|builder: &mut BlueprintBuilder| {
             builder
-                .sled_expunge_zone(sled_id, new_zone.id)
+                .sled_expunge_zone(
+                    sled_id,
+                    new_zone.id,
+                    &ZoneExpungeReason::Test,
+                )
                 .context("expunging zone")?;
             Ok(())
         },
diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
index 34816548d49..06bc0886855 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
@@ -14,7 +14,6 @@ use crate::blueprint_editor::ExternalSnatNetworkingChoice;
 use crate::blueprint_editor::NoAvailableDnsSubnets;
 use crate::blueprint_editor::SledEditError;
 use crate::blueprint_editor::SledEditor;
-use crate::planner::ZoneExpungeReason;
 use crate::planner::rng::PlannerRng;
 use anyhow::Context as _;
 use anyhow::anyhow;
@@ -45,6 +44,7 @@ use nexus_types::deployment::PendingMgsUpdates;
 use nexus_types::deployment::PlanningInput;
 use nexus_types::deployment::SledFilter;
 use nexus_types::deployment::SledResources;
+use nexus_types::deployment::ZoneExpungeReason;
 use nexus_types::deployment::ZpoolFilter;
 use nexus_types::deployment::ZpoolName;
 use nexus_types::deployment::blueprint_zone_type;
@@ -337,6 +337,13 @@ impl fmt::Display for Operation {
                     ZoneExpungeReason::ClickhouseSingleNodeDisabled => {
                         "clickhouse single-node disabled via policy"
                     }
+                    ZoneExpungeReason::ManualEdit => {
+                        "blueprint edited manually"
+                    }
+                    ZoneExpungeReason::Test => "for testing purposes",
+                    ZoneExpungeReason::UpdatedSource { from, to } => {
+                        &format!("updating from image source {from:?} → {to:?}")
+                    }
                 };
                 write!(
                     f,
@@ -1741,6 +1748,7 @@ impl<'a> BlueprintBuilder<'a> {
         &mut self,
         sled_id: SledUuid,
         zone_id: OmicronZoneUuid,
+        _reason: &ZoneExpungeReason,
     ) -> Result<SledEditCounts, Error> {
         let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| {
             Error::Planner(anyhow!(
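The new `ZoneExpungeReason` variants feed the `Operation` display above. A self-contained sketch of the same match-on-reason pattern (the `Reason` type here is a hypothetical stand-in, not the real `Operation` code):

```rust
use std::fmt;

// Hypothetical stand-in for ZoneExpungeReason, to illustrate the pattern.
enum Reason {
    ManualEdit,
    Test,
    UpdatedSource { from: String, to: String },
}

impl fmt::Display for Reason {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Reason::ManualEdit => write!(f, "blueprint edited manually"),
            Reason::Test => write!(f, "for testing purposes"),
            Reason::UpdatedSource { from, to } => {
                write!(f, "updating from image source {from:?} → {to:?}")
            }
        }
    }
}

fn main() {
    let reason = Reason::UpdatedSource {
        from: "install-dataset".to_string(),
        to: "artifact-1.0.0".to_string(),
    };
    // e.g.: expunged zone (updating from image source "install-dataset" → "artifact-1.0.0")
    println!("expunged zone ({reason})");
}
```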
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs
index bf065e95cdf..81af2de062d 100644
--- a/nexus/reconfigurator/planning/src/planner.rs
+++ b/nexus/reconfigurator/planning/src/planner.rs
@@ -29,6 +29,8 @@ use nexus_types::deployment::DiskFilter;
 use nexus_types::deployment::PlanningInput;
 use nexus_types::deployment::SledDetails;
 use nexus_types::deployment::SledFilter;
+use nexus_types::deployment::WaitCondition;
+use nexus_types::deployment::ZoneExpungeReason;
 use nexus_types::deployment::ZpoolFilter;
 use nexus_types::external_api::views::PhysicalDiskPolicy;
 use nexus_types::external_api::views::SledPolicy;
@@ -78,9 +80,72 @@ pub(crate) mod rng;
 /// services, etc.).
 const NUM_CONCURRENT_MGS_UPDATES: usize = 1;

-enum UpdateStepResult {
-    ContinueToNextStep,
-    Waiting,
+/// The planner operates in distinct steps, and it is a compile-time error
+/// to skip or reorder them; see `Planner::do_plan`.
+trait PlannerStep {
+    type Next: PlannerStep;
+    fn new() -> Self;
+    fn into_next(self) -> Self::Next;
+}
+
+macro_rules! planner_step {
+    ($name: ident, $next: ident) => {
+        // Non-terminal step.
+        struct $name;
+        impl PlannerStep for $name {
+            type Next = $next;
+
+            fn new() -> Self {
+                Self
+            }
+
+            fn into_next(self) -> Self::Next {
+                Self::Next::new()
+            }
+        }
+    };
+    ($name: ident) => {
+        // Terminal step.
+        struct $name;
+        impl PlannerStep for $name {
+            type Next = Self;
+
+            fn new() -> Self {
+                Self
+            }
+
+            fn into_next(self) -> Self::Next {
+                unreachable!("terminal step has no next")
+            }
+        }
+    };
+}
+planner_step!(ExpungeStep, AddStep);
+planner_step!(AddStep, DecommissionStep);
+planner_step!(DecommissionStep, MgsUpdatesStep);
+planner_step!(MgsUpdatesStep, UpdateZonesStep);
+planner_step!(UpdateZonesStep, CockroachDbSettingsStep);
+planner_step!(CockroachDbSettingsStep, TerminalStep);
+planner_step!(TerminalStep);
+
+enum UpdateStepResult<Prev: PlannerStep> {
+    ContinueToNextStep(Prev),
+    PlanningComplete(TerminalStep),
+    Waiting(Vec<WaitCondition>),
+}
+
+impl<Prev: PlannerStep> UpdateStepResult<Prev> {
+    fn continue_to_next_step(prev: Prev) -> Self {
+        Self::ContinueToNextStep(prev)
+    }
+
+    fn planning_complete(prev: CockroachDbSettingsStep) -> Self {
+        Self::PlanningComplete(prev.into_next())
+    }
+
+    fn waiting(wait_on: Vec<WaitCondition>) -> Self {
+        Self::Waiting(wait_on)
+    }
 }

 pub struct Planner<'a> {
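The trait-and-macro machinery above is a typestate-style chain: each step type names its successor, and each planning phase consumes a token proving the previous phase ran. A minimal sketch of the same idea, using hypothetical `StepA`/`StepB`/`Done` names unrelated to the planner:

```rust
trait Step {
    type Next: Step;
    fn into_next(self) -> Self::Next;
}

struct StepA;
struct StepB;
struct Done;

impl Step for StepA {
    type Next = StepB;
    fn into_next(self) -> StepB { StepB }
}
impl Step for StepB {
    type Next = Done;
    fn into_next(self) -> Done { Done }
}
impl Step for Done {
    type Next = Done;
    fn into_next(self) -> Done { self }
}

// Each phase consumes the token for its own step and returns the next one.
// Calling plan_b before plan_a cannot compile, because only plan_a can
// produce a StepB value.
fn plan_a(token: StepA) -> StepB { token.into_next() }
fn plan_b(token: StepB) -> Done { token.into_next() }

fn main() {
    let done = plan_b(plan_a(StepA));
    let _ = done;
}
```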
@@ -97,6 +162,7 @@
     // information about all sleds that we expect), we should verify that up
     // front and update callers to ensure that it's true.
     inventory: &'a Collection,
+    waiting_on: Vec<WaitCondition>,
 }

 impl<'a> Planner<'a> {
@@ -116,7 +182,7 @@ impl<'a> Planner<'a> {
             inventory,
             creator,
         )?;
-        Ok(Planner { log, input, blueprint, inventory })
+        Ok(Planner { log, input, blueprint, inventory, waiting_on: Vec::new() })
     }

     /// Within tests, set a seeded RNG for deterministic results.
@@ -136,6 +202,13 @@
         Ok(self.blueprint.build())
     }

+    pub fn plan_and_wait_conditions(
+        mut self,
+    ) -> Result<(Blueprint, Vec<WaitCondition>), Error> {
+        let conditions = self.waiting_on.drain(..).collect();
+        Ok((self.plan()?, conditions))
+    }
+
     fn check_input_validity(&self) -> Result<(), Error> {
         if self.input.target_internal_dns_zone_count() > INTERNAL_DNS_REDUNDANCY
         {
@@ -145,18 +218,31 @@
     }

     fn do_plan(&mut self) -> Result<(), Error> {
-        self.do_plan_expunge()?;
-        self.do_plan_add()?;
-        self.do_plan_decommission()?;
-        if let UpdateStepResult::ContinueToNextStep = self.do_plan_mgs_updates()
-        {
-            self.do_plan_zone_updates()?;
+        macro_rules! step {
+            ($f: expr) => {{
+                match $f {
+                    UpdateStepResult::ContinueToNextStep(step) => step,
+                    UpdateStepResult::PlanningComplete(_) => return Ok(()),
+                    UpdateStepResult::Waiting(mut waiting_on) => {
+                        self.waiting_on.extend(waiting_on.drain(..));
+                        return Ok(());
+                    }
+                }
+            }};
         }
-        self.do_plan_cockroachdb_settings();
-        Ok(())
+
+        let ready = step!(self.do_plan_expunge()?);
+        let ready = step!(self.do_plan_add(ready.into_next())?);
+        let ready = step!(self.do_plan_decommission(ready.into_next())?);
+        let ready = step!(self.do_plan_mgs_updates(ready.into_next()));
+        let ready = step!(self.do_plan_zone_updates(ready.into_next())?);
+        step!(self.do_plan_cockroachdb_settings(ready.into_next())?);
+        unreachable!("should have reached a terminal state already!");
     }

-    fn do_plan_decommission(&mut self) -> Result<(), Error> {
+    fn do_plan_decommission(
+        &mut self,
+        ready: DecommissionStep,
+    ) -> Result<UpdateStepResult<DecommissionStep>, Error> {
         // Check for any sleds that are currently commissioned but can be
         // decommissioned. Our gates for decommissioning are:
         //
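One detail worth noting in `do_plan` above: `step!` has to be a macro rather than a helper function, because its `return Ok(())` must exit the enclosing `do_plan` when a step reports `Waiting`. A toy illustration of that property (stand-in types; `Vec<String>` plays the role of `Vec<WaitCondition>`):

```rust
enum StepResult<T> {
    Continue(T),
    Waiting(Vec<String>), // stand-in for Vec<WaitCondition>
}

fn run() -> Result<(), String> {
    let mut waiting_on: Vec<String> = Vec::new();
    // Because the macro expands inline, `return` exits run() itself,
    // which a helper function could not do.
    macro_rules! step {
        ($f: expr) => {{
            match $f {
                StepResult::Continue(v) => v,
                StepResult::Waiting(mut w) => {
                    waiting_on.extend(w.drain(..));
                    println!("stopping early; waiting on {waiting_on:?}");
                    return Ok(());
                }
            }
        }};
    }

    let x = step!(StepResult::Continue(1));
    let _y: i32 = step!(StepResult::Waiting(vec![format!("step after {x}")]));
    unreachable!("the Waiting arm short-circuits before this point");
}

fn main() {
    run().unwrap();
}
```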
@@ -239,7 +325,7 @@
             }
         }

-        Ok(())
+        Ok(UpdateStepResult::continue_to_next_step(ready))
     }

     fn do_plan_decommission_expunged_disks_for_in_service_sled(
@@ -289,7 +375,9 @@
         self.blueprint.sled_decommission_disks(sled_id, disks_to_decommission)
     }

-    fn do_plan_expunge(&mut self) -> Result<(), Error> {
+    fn do_plan_expunge(
+        &mut self,
+    ) -> Result<UpdateStepResult<ExpungeStep>, Error> {
         let mut commissioned_sled_ids = BTreeSet::new();

         // Remove services from sleds marked expunged. We use
@@ -304,7 +392,7 @@

         // Check for any decommissioned sleds (i.e., sleds for which our
         // blueprint has zones, but are not in the input sled list). Any zones
-        // for decommissioned sleds must have already be expunged for
+        // for decommissioned sleds must have already been expunged for
         // decommissioning to have happened; fail if we find non-expunged zones
         // associated with a decommissioned sled.
         for sled_id in self.blueprint.sled_ids_with_zones() {
@@ -330,7 +418,7 @@
             }
         }

-        Ok(())
+        Ok(UpdateStepResult::continue_to_next_step(ExpungeStep))
     }

     fn do_plan_expunge_for_commissioned_sled(
@@ -493,7 +581,10 @@
         Ok(())
     }

-    fn do_plan_add(&mut self) -> Result<(), Error> {
+    fn do_plan_add(
+        &mut self,
+        ready: AddStep,
+    ) -> Result<UpdateStepResult<AddStep>, Error> {
         // Internal DNS is a prerequisite for bringing up all other zones. At
         // this point, we assume that internal DNS (as a service) is already
         // functioning.
@@ -680,7 +771,7 @@
         // ensure that all sleds have the datasets they need to have.
         self.do_plan_datasets()?;

-        Ok(())
+        Ok(UpdateStepResult::continue_to_next_step(ready))
     }

     fn do_plan_datasets(&mut self) -> Result<(), Error> {
@@ -939,7 +1030,10 @@

     /// Update at most one MGS-managed device (SP, RoT, etc.), if any are out of
     /// date.
-    fn do_plan_mgs_updates(&mut self) -> UpdateStepResult {
+    fn do_plan_mgs_updates(
+        &mut self,
+        ready: MgsUpdatesStep,
+    ) -> UpdateStepResult<MgsUpdatesStep> {
         // Determine which baseboards we will consider updating.
         //
         // Sleds may be present but not adopted as part of the control plane.
@@ -985,17 +1079,21 @@
         );

         // TODO This is not quite right. See oxidecomputer/omicron#8285.
-        let rv = if next.is_empty() {
-            UpdateStepResult::ContinueToNextStep
+        self.blueprint.pending_mgs_updates_replace_all(next.clone());
+        if next.is_empty() {
+            UpdateStepResult::continue_to_next_step(ready)
         } else {
-            UpdateStepResult::Waiting
-        };
-        self.blueprint.pending_mgs_updates_replace_all(next);
-        rv
+            UpdateStepResult::waiting(vec![WaitCondition::MgsUpdates {
+                pending: next.clone(),
+            }])
+        }
     }

     /// Update at most one existing zone to use a new image source.
-    fn do_plan_zone_updates(&mut self) -> Result<(), Error> {
+    fn do_plan_zone_updates(
+        &mut self,
+        ready: UpdateZonesStep,
+    ) -> Result<UpdateStepResult<UpdateZonesStep>, Error> {
         // We are only interested in non-decommissioned sleds.
         let sleds = self
             .input
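The `do_plan_zone_updates` rewrite that continues below is also a behavioral change: the old code stopped at the first sled with a stale zone, while the new code records every stale zone as a wait condition. A toy contrast of the two shapes (invented types, not planner code):

```rust
#[derive(Debug)]
struct Stale {
    zone: u32,
}

// Before (shape of the old code): stop at the first mismatch, reporting
// only that *something* is out of date.
fn any_mismatch(zones: &[(u32, bool)]) -> bool {
    !zones.iter().all(|&(_, up_to_date)| up_to_date)
}

// After (shape of the new code): collect one entry per stale zone so the
// caller can report exactly what it is waiting on.
fn all_mismatches(zones: &[(u32, bool)]) -> Vec<Stale> {
    zones
        .iter()
        .filter(|&&(_, up_to_date)| !up_to_date)
        .map(|&(zone, _)| Stale { zone })
        .collect()
}

fn main() {
    let zones = [(1, true), (2, false), (3, false)];
    assert!(any_mismatch(&zones));
    println!("{:?}", all_mismatches(&zones)); // [Stale { zone: 2 }, Stale { zone: 3 }]
}
```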
@@ -1009,25 +1107,26 @@
             .all_running_omicron_zones()
             .map(|z| (z.id, z.image_source.clone()))
             .collect::<BTreeMap<_, _>>();
+        let mut waiting_on = Vec::new();
         for &sled_id in &sleds {
-            if !self
-                .blueprint
-                .current_sled_zones(
-                    sled_id,
-                    BlueprintZoneDisposition::is_in_service,
-                )
-                .all(|zone| {
-                    let image_source = zone.image_source.clone().into();
-                    inventory_zones.get(&zone.id) == Some(&image_source)
-                })
-            {
-                info!(
-                    self.log, "some zones not yet up-to-date";
-                    "sled_id" => %sled_id,
-                );
-                return Ok(());
+            for zone in self.blueprint.current_sled_zones(
+                sled_id,
+                BlueprintZoneDisposition::is_in_service,
+            ) {
+                let image_source = zone.image_source.clone().into();
+                if inventory_zones.get(&zone.id) != Some(&image_source) {
+                    waiting_on.push(WaitCondition::ZoneUpdate {
+                        sled_id,
+                        zone_id: zone.id,
+                        new_image_source: zone.image_source.clone(),
+                    });
+                }
             }
         }
+        if !waiting_on.is_empty() {
+            info!(self.log, "some zones not yet up-to-date");
+            return Ok(UpdateStepResult::waiting(waiting_on));
+        }

         // Update the first out-of-date zone.
         let out_of_date_zones = sleds
@@ -1052,7 +1151,7 @@
         }

         info!(self.log, "all zones up-to-date");
-        Ok(())
+        Ok(UpdateStepResult::continue_to_next_step(ready))
     }

     /// Update a zone to use a new image source, either in-place or by
@@ -1061,7 +1160,7 @@
         &mut self,
         sled_id: SledUuid,
         zone: &BlueprintZoneConfig,
-    ) -> Result<(), Error> {
+    ) -> Result<UpdateStepResult<UpdateZonesStep>, Error> {
         let zone_kind = zone.zone_type.kind();
         let image_source = self.blueprint.zone_image_source(zone_kind);
         if zone.image_source == image_source {
@@ -1075,6 +1174,10 @@
             );
             return Err(Error::ZoneAlreadyUpToDate);
         } else {
+            let reason = ZoneExpungeReason::UpdatedSource {
+                from: zone.image_source.clone(),
+                to: image_source.clone(),
+            };
             match zone_kind {
                 ZoneKind::Crucible
                 | ZoneKind::Clickhouse
@@ -1096,8 +1199,15 @@
                     self.blueprint.sled_set_zone_source(
                         sled_id,
                         zone.id,
-                        image_source,
+                        image_source.clone(),
                     )?;
+                    Ok(UpdateStepResult::waiting(vec![
+                        (WaitCondition::zone_update(
+                            sled_id,
+                            zone.id,
+                            image_source,
+                        )),
+                    ]))
                 }
                 ZoneKind::BoundaryNtp
                 | ZoneKind::CruciblePantry
@@ -1117,14 +1227,20 @@
                         zone.zone_type.kind(),
                         zone.id
                     ));
-                    self.blueprint.sled_expunge_zone(sled_id, zone.id)?;
+                    self.blueprint
+                        .sled_expunge_zone(sled_id, zone.id, &reason)?;
+                    Ok(UpdateStepResult::waiting(vec![
+                        WaitCondition::zone_expunge(sled_id, zone.id, reason),
+                    ]))
                 }
             }
         }
-        Ok(())
     }

-    fn do_plan_cockroachdb_settings(&mut self) {
+    fn do_plan_cockroachdb_settings(
+        &mut self,
+        ready: CockroachDbSettingsStep,
+    ) -> Result<UpdateStepResult<CockroachDbSettingsStep>, Error> {
         // Figure out what we should set the CockroachDB "preserve downgrade
         // option" setting to based on the planning input.
         //
@@ -1216,17 +1332,9 @@
         // cluster version -- we're likely in the middle of an upgrade!
         //
         // https://www.cockroachlabs.com/docs/stable/cluster-settings#change-a-cluster-setting
-    }
-}

-/// The reason a sled's zones need to be expunged.
-///
-/// This is used only for introspection and logging -- it's not part of the
-/// logical flow.
-#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
-pub(crate) enum ZoneExpungeReason {
-    ClickhouseClusterDisabled,
-    ClickhouseSingleNodeDisabled,
+        Ok(UpdateStepResult::planning_complete(ready))
+    }
 }

 #[cfg(test)]
diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs
index ce3c2adb610..7f683c15de3 100644
--- a/nexus/src/app/background/tasks/blueprint_planner.rs
+++ b/nexus/src/app/background/tasks/blueprint_planner.rs
@@ -142,7 +142,7 @@ impl BlueprintPlanner {
                 ));
             }
         };
-        let blueprint = match planner.plan() {
+        let (blueprint, waiting_on) = match planner.plan_and_wait_conditions() {
             Ok(blueprint) => blueprint,
             Err(error) => {
                 error!(&opctx.log, "can't plan: {error}");
@@ -161,7 +161,10 @@ impl BlueprintPlanner {
                 "blueprint unchanged from current target";
                 "parent_blueprint_id" => %parent_blueprint_id,
             );
-            return BlueprintPlannerStatus::Unchanged { parent_blueprint_id };
+            return BlueprintPlannerStatus::Unchanged {
+                parent_blueprint_id,
+                waiting_on,
+            };
         }

         // We have a fresh blueprint; save it.
@@ -227,13 +230,18 @@ impl BlueprintPlanner {
                 return BlueprintPlannerStatus::Planned {
                     parent_blueprint_id,
                     error: format!("{error}"),
+                    waiting_on,
                 };
             }
         }

         // We have a new target!
         self.tx_blueprint.send_replace(Some(Arc::new((target, blueprint))));
-        BlueprintPlannerStatus::Targeted { parent_blueprint_id, blueprint_id }
+        BlueprintPlannerStatus::Targeted {
+            parent_blueprint_id,
+            blueprint_id,
+            waiting_on,
+        }
     }
 }

@@ -314,6 +322,7 @@ mod test {
             BlueprintPlannerStatus::Targeted {
                 parent_blueprint_id,
                 blueprint_id,
+                waiting_on: _,
             } if parent_blueprint_id == initial_blueprint.id
                 && blueprint_id != initial_blueprint.id =>
             {
@@ -343,6 +352,7 @@ mod test {
             status,
             BlueprintPlannerStatus::Unchanged {
                 parent_blueprint_id: blueprint_id,
+                waiting_on: vec![],
             }
         );

@@ -405,6 +415,7 @@ mod test {
             status,
             BlueprintPlannerStatus::Unchanged {
                 parent_blueprint_id: blueprint_id,
+                waiting_on: vec![],
             }
         );
     }
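A downstream consumer of the richer status might collapse it to one line, roughly as below. The `summarize` helper is purely illustrative (it assumes `BlueprintPlannerStatus` derives `Debug`, and the catch-all arm covers any variants not shown in this diff):

```rust
use nexus_types::internal_api::background::BlueprintPlannerStatus;

// Illustrative only: count wait conditions per variant.
fn summarize(status: &BlueprintPlannerStatus) -> String {
    match status {
        BlueprintPlannerStatus::Unchanged { parent_blueprint_id, waiting_on } => {
            format!(
                "unchanged from {parent_blueprint_id}; waiting on {} events",
                waiting_on.len()
            )
        }
        BlueprintPlannerStatus::Planned { parent_blueprint_id, error, waiting_on } => {
            format!(
                "planned from {parent_blueprint_id} but not targeted ({error}); \
                 waiting on {} events",
                waiting_on.len()
            )
        }
        BlueprintPlannerStatus::Targeted { blueprint_id, waiting_on, .. } => {
            format!(
                "targeted {blueprint_id}; waiting on {} events",
                waiting_on.len()
            )
        }
        // Error and any other variants: fall back to Debug formatting.
        other => format!("{other:?}"),
    }
}
```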
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index 420b03d0ad7..439a753f6da 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -1827,6 +1827,56 @@ pub struct UnstableReconfiguratorState {
     pub external_dns_zone_names: Vec<String>,
 }

+/// An event that the planner is waiting on.
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
+pub enum WaitCondition {
+    /// Waiting for MGS to apply some updates to the RoT/SP.
+    MgsUpdates { pending: PendingMgsUpdates },
+    /// Waiting for a zone to be expunged.
+    ZoneExpunge {
+        sled_id: SledUuid,
+        zone_id: OmicronZoneUuid,
+        reason: ZoneExpungeReason,
+    },
+    /// Waiting for a zone with the updated source to appear in inventory.
+    ZoneUpdate {
+        sled_id: SledUuid,
+        zone_id: OmicronZoneUuid,
+        new_image_source: BlueprintZoneImageSource,
+    },
+}
+
+impl WaitCondition {
+    pub fn zone_expunge(
+        sled_id: SledUuid,
+        zone_id: OmicronZoneUuid,
+        reason: ZoneExpungeReason,
+    ) -> Self {
+        Self::ZoneExpunge { sled_id, zone_id, reason }
+    }
+
+    pub fn zone_update(
+        sled_id: SledUuid,
+        zone_id: OmicronZoneUuid,
+        new_image_source: BlueprintZoneImageSource,
+    ) -> Self {
+        Self::ZoneUpdate { sled_id, zone_id, new_image_source }
+    }
+}
+
+/// The reason a zone needs to be expunged.
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
+pub enum ZoneExpungeReason {
+    ClickhouseClusterDisabled,
+    ClickhouseSingleNodeDisabled,
+    ManualEdit,
+    Test,
+    UpdatedSource {
+        from: BlueprintZoneImageSource,
+        to: BlueprintZoneImageSource,
+    },
+}
+
 #[cfg(test)]
 mod test {
     use super::ExpectedVersion;
diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs
index ca97f5f8924..f034d3b0fbc 100644
--- a/nexus/types/src/internal_api/background.rs
+++ b/nexus/types/src/internal_api/background.rs
@@ -2,6 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.

+use crate::deployment::WaitCondition;
 use crate::external_api::views;
 use chrono::DateTime;
 use chrono::Utc;
@@ -471,15 +472,26 @@ pub enum BlueprintPlannerStatus {

     /// Planning produced a blueprint identital to the current target,
     /// so we threw it away and did nothing.
-    Unchanged { parent_blueprint_id: BlueprintUuid },
+    Unchanged {
+        parent_blueprint_id: BlueprintUuid,
+        waiting_on: Vec<WaitCondition>,
+    },

     /// Planning produced a new blueprint, but we failed to make it
     /// the current target and so deleted it.
-    Planned { parent_blueprint_id: BlueprintUuid, error: String },
+    Planned {
+        parent_blueprint_id: BlueprintUuid,
+        error: String,
+        waiting_on: Vec<WaitCondition>,
+    },

     /// Planing succeeded, and we saved and made the new blueprint the
     /// current target.
-    Targeted { parent_blueprint_id: BlueprintUuid, blueprint_id: BlueprintUuid },
+    Targeted {
+        parent_blueprint_id: BlueprintUuid,
+        blueprint_id: BlueprintUuid,
+        waiting_on: Vec<WaitCondition>,
+    },
 }

 /// The status of a `alert_dispatcher` background task activation.
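Finally, the intended call pattern for the new `WaitCondition` constructors, as a sketch (the ids are invented, and this assumes the typed-uuid `new_v4` constructors from `omicron-uuid-kinds`):

```rust
use nexus_types::deployment::{WaitCondition, ZoneExpungeReason};
use omicron_uuid_kinds::{OmicronZoneUuid, SledUuid};

fn example_conditions() -> Vec<WaitCondition> {
    // Invented ids, for illustration only.
    let sled_id = SledUuid::new_v4();
    let zone_id = OmicronZoneUuid::new_v4();
    vec![WaitCondition::zone_expunge(
        sled_id,
        zone_id,
        ZoneExpungeReason::ManualEdit,
    )]
}

fn main() {
    println!("{:?}", example_conditions());
}
```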