Skip to content

Commit c3ec70c

Browse files
authored
[reconfigurator] rewrite zones_currently_updating to be clearer (#8594)
This is semantically the same as the prior implementation but is hopefully easier to understand. Both @jgallagher and I were a bit confused about the previous implementation, which is a sign that a rewrite might help.
1 parent 248e7ed commit c3ec70c

File tree

5 files changed

+107
-10
lines changed

5 files changed

+107
-10
lines changed

dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ INFO sufficient ExternalDns zones exist in plan, desired_count: 0, current_count
506506
WARN failed to place all new desired Nexus zones, placed: 0, wanted_to_place: 3
507507
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
508508
WARN cannot issue more SP updates (no current artifacts)
509-
INFO some zones not yet up-to-date, sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b, zones_currently_updating: [(b3c9c041-d2f0-4767-bdaf-0e52e9d7a013 (service), InternalNtp)]
509+
INFO some zones not yet up-to-date, sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: b3c9c041-d2f0-4767-bdaf-0e52e9d7a013 (service), zone_kind: InternalNtp, reason: MissingInInventory { bp_image_source: InstallDataset } }]
510510
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
511511
generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
512512

dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -981,7 +981,7 @@ INFO added zone to sled, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a, kind: Ex
981981
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
982982
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
983983
WARN cannot issue more SP updates (no current artifacts)
984-
INFO some zones not yet up-to-date, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a, zones_currently_updating: [(fe2d5287-24e3-4071-b214-2640b097a759 (service), ExternalDns)]
984+
INFO some zones not yet up-to-date, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: fe2d5287-24e3-4071-b214-2640b097a759 (service), zone_kind: ExternalDns, reason: MissingInInventory { bp_image_source: InstallDataset } }]
985985
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
986986
generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0
987987

dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,7 @@ INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count
10141014
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
10151015
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
10161016
WARN cannot issue more SP updates (no current artifacts)
1017-
INFO some zones not yet up-to-date, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, zones_currently_updating: [(e375dd21-320b-43b7-bc92-a2c3dac9d9e1 (service), InternalDns)]
1017+
INFO some zones not yet up-to-date, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: e375dd21-320b-43b7-bc92-a2c3dac9d9e1 (service), zone_kind: InternalDns, reason: MissingInInventory { bp_image_source: InstallDataset } }]
10181018
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
10191019
generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4
10201020

nexus/reconfigurator/planning/src/planner.rs

Lines changed: 89 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ use crate::blueprint_editor::SledEditError;
1616
use crate::mgs_updates::plan_mgs_updates;
1717
use crate::planner::omicron_zone_placement::PlacementError;
1818
use gateway_client::types::SpType;
19+
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;
20+
use nexus_sled_agent_shared::inventory::OmicronZoneImageSource;
1921
use nexus_sled_agent_shared::inventory::OmicronZoneType;
2022
use nexus_sled_agent_shared::inventory::ZoneKind;
2123
use nexus_types::deployment::Blueprint;
@@ -39,6 +41,7 @@ use nexus_types::external_api::views::SledState;
3941
use nexus_types::inventory::Collection;
4042
use omicron_common::policy::COCKROACHDB_REDUNDANCY;
4143
use omicron_common::policy::INTERNAL_DNS_REDUNDANCY;
44+
use omicron_uuid_kinds::OmicronZoneUuid;
4245
use omicron_uuid_kinds::PhysicalDiskUuid;
4346
use omicron_uuid_kinds::SledUuid;
4447
use slog::debug;
@@ -1224,22 +1227,101 @@ impl<'a> Planner<'a> {
12241227
// Wait for zones to appear up-to-date in the inventory.
12251228
let inventory_zones = self
12261229
.inventory
1227-
.all_running_omicron_zones()
1228-
.map(|z| (z.id, z.image_source.clone()))
1230+
.all_reconciled_omicron_zones()
1231+
.map(|(z, sa_result)| (z.id, (&z.image_source, sa_result)))
12291232
.collect::<BTreeMap<_, _>>();
1233+
1234+
#[derive(Debug)]
1235+
#[expect(dead_code)]
1236+
struct ZoneCurrentlyUpdating<'a> {
1237+
zone_id: OmicronZoneUuid,
1238+
zone_kind: ZoneKind,
1239+
reason: UpdatingReason<'a>,
1240+
}
1241+
1242+
#[derive(Debug)]
1243+
#[expect(dead_code)]
1244+
enum UpdatingReason<'a> {
1245+
ImageSourceMismatch {
1246+
bp_image_source: &'a BlueprintZoneImageSource,
1247+
inv_image_source: &'a OmicronZoneImageSource,
1248+
},
1249+
MissingInInventory {
1250+
bp_image_source: &'a BlueprintZoneImageSource,
1251+
},
1252+
ReconciliationError {
1253+
bp_image_source: &'a BlueprintZoneImageSource,
1254+
inv_image_source: &'a OmicronZoneImageSource,
1255+
message: &'a str,
1256+
},
1257+
}
1258+
12301259
for &sled_id in &sleds {
1260+
// Build a list of zones currently in the blueprint but where
1261+
// inventory has a mismatch or does not know about the zone.
1262+
//
1263+
// What about the case where a zone is in inventory but not in the
1264+
// blueprint? See
1265+
// https://github.com/oxidecomputer/omicron/issues/8589.
12311266
let zones_currently_updating = self
12321267
.blueprint
12331268
.current_sled_zones(
12341269
sled_id,
12351270
BlueprintZoneDisposition::is_in_service,
12361271
)
12371272
.filter_map(|zone| {
1238-
let image_source = zone.image_source.clone().into();
1239-
if inventory_zones.get(&zone.id) != Some(&image_source) {
1240-
Some((zone.id, zone.zone_type.kind()))
1241-
} else {
1242-
None
1273+
let bp_image_source =
1274+
OmicronZoneImageSource::from(zone.image_source.clone());
1275+
match inventory_zones.get(&zone.id) {
1276+
Some((
1277+
inv_image_source,
1278+
ConfigReconcilerInventoryResult::Ok,
1279+
)) if *inv_image_source == &bp_image_source => {
1280+
// The inventory and blueprint image sources match
1281+
// -- this means that the zone is up-to-date.
1282+
None
1283+
}
1284+
Some((
1285+
inv_image_source,
1286+
ConfigReconcilerInventoryResult::Ok,
1287+
)) => {
1288+
// The inventory and blueprint image sources differ.
1289+
Some(ZoneCurrentlyUpdating {
1290+
zone_id: zone.id,
1291+
zone_kind: zone.kind(),
1292+
reason: UpdatingReason::ImageSourceMismatch {
1293+
bp_image_source: &zone.image_source,
1294+
inv_image_source,
1295+
},
1296+
})
1297+
}
1298+
Some((
1299+
inv_image_source,
1300+
ConfigReconcilerInventoryResult::Err { message },
1301+
)) => {
1302+
// The inventory reports this zone but there was an
1303+
// error reconciling it (most likely an error
1304+
// starting the zone).
1305+
Some(ZoneCurrentlyUpdating {
1306+
zone_id: zone.id,
1307+
zone_kind: zone.kind(),
1308+
reason: UpdatingReason::ReconciliationError {
1309+
bp_image_source: &zone.image_source,
1310+
inv_image_source,
1311+
message,
1312+
},
1313+
})
1314+
}
1315+
None => {
1316+
// The blueprint has a zone that inventory does not have.
1317+
Some(ZoneCurrentlyUpdating {
1318+
zone_id: zone.id,
1319+
zone_kind: zone.kind(),
1320+
reason: UpdatingReason::MissingInInventory {
1321+
bp_image_source: &zone.image_source,
1322+
},
1323+
})
1324+
}
12431325
}
12441326
})
12451327
.collect::<Vec<_>>();

nexus/types/src/inventory.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use iddqd::IdOrdItem;
2323
use iddqd::IdOrdMap;
2424
use iddqd::id_upcast;
2525
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory;
26+
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;
2627
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus;
2728
use nexus_sled_agent_shared::inventory::InventoryDataset;
2829
use nexus_sled_agent_shared::inventory::InventoryDisk;
@@ -197,6 +198,20 @@ impl Collection {
197198
.flat_map(|reconciliation| reconciliation.running_omicron_zones())
198199
}
199200

201+
/// Iterate over all the Omicron zones along with their statuses (as
202+
/// reported by each sled-agent's last reconciliation attempt)
203+
pub fn all_reconciled_omicron_zones(
204+
&self,
205+
) -> impl Iterator<Item = (&OmicronZoneConfig, &ConfigReconcilerInventoryResult)>
206+
{
207+
self.sled_agents
208+
.iter()
209+
.filter_map(|sa| sa.last_reconciliation.as_ref())
210+
.flat_map(|reconciliation| {
211+
reconciliation.reconciled_omicron_zones()
212+
})
213+
}
214+
200215
/// Iterate over the sled ids of sleds identified as Scrimlets
201216
pub fn scrimlets(&self) -> impl Iterator<Item = SledUuid> + '_ {
202217
self.sled_agents.iter().filter_map(|sa| {

0 commit comments

Comments
 (0)