diff --git a/Cargo.lock b/Cargo.lock index 1c920b3397..5012049ae9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12174,6 +12174,7 @@ dependencies = [ "slog", "slog-error-chain", "strum 0.27.1", + "swrite", "thiserror 2.0.12", "toml 0.8.23", "tufaceous-artifact", diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index f86c80241a..3bb0546360 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1123,6 +1123,7 @@ LEDGERED SLED CONFIG slot A details UNAVAILABLE: constructed via debug_assume_success() slot B details UNAVAILABLE: constructed via debug_assume_success() last reconciled config: matches ledgered config + no mupdate override to clear no orphaned datasets all disks reconciled successfully all datasets reconciled successfully @@ -1230,6 +1231,7 @@ LEDGERED SLED CONFIG slot A details UNAVAILABLE: constructed via debug_assume_success() slot B details UNAVAILABLE: constructed via debug_assume_success() last reconciled config: matches ledgered config + no mupdate override to clear no orphaned datasets all disks reconciled successfully all datasets reconciled successfully @@ -1430,6 +1432,7 @@ LEDGERED SLED CONFIG slot A details UNAVAILABLE: constructed via debug_assume_success() slot B details UNAVAILABLE: constructed via debug_assume_success() last reconciled config: matches ledgered config + no mupdate override to clear no orphaned datasets all disks reconciled successfully all datasets reconciled successfully diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index d76c1a21a5..002c441bb7 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -182,6 +182,7 @@ LEDGERED SLED CONFIG slot A details UNAVAILABLE: constructed via debug_assume_success() slot B details UNAVAILABLE: constructed via debug_assume_success() last reconciled config: matches ledgered config + error reading mupdate override, so sled agent was not instructed to clear it no orphaned datasets all disks reconciled successfully all datasets reconciled successfully @@ -288,6 +289,7 @@ LEDGERED SLED CONFIG slot A details UNAVAILABLE: constructed via debug_assume_success() slot B details UNAVAILABLE: constructed via debug_assume_success() last reconciled config: matches ledgered config + mupdate override present, but sled agent was not instructed to clear it no orphaned datasets all disks reconciled successfully all datasets reconciled successfully @@ -383,6 +385,7 @@ LEDGERED SLED CONFIG slot A details UNAVAILABLE: constructed via debug_assume_success() slot B details UNAVAILABLE: constructed via debug_assume_success() last reconciled config: matches ledgered config + mupdate override present, but sled agent was not instructed to clear it no orphaned datasets all disks reconciled successfully all datasets reconciled successfully diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index e0d06392ce..7c9aa4ab49 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -143,6 +143,10 @@ pub struct ConfigReconcilerInventory { pub orphaned_datasets: IdOrdMap, pub zones: BTreeMap, pub boot_partitions: BootPartitionContents, + /// The result of clearing the mupdate override field. + /// + /// `None` if `remove_mupdate_override` was not provided in the sled config. + pub clear_mupdate_override: Option, } impl ConfigReconcilerInventory { @@ -200,6 +204,17 @@ impl ConfigReconcilerInventory { .iter() .map(|z| (z.id, ConfigReconcilerInventoryResult::Ok)) .collect(); + let clear_mupdate_override = config.remove_mupdate_override.map(|_| { + ClearMupdateOverrideInventory { + boot_disk_result: Ok( + ClearMupdateOverrideBootSuccessInventory::Cleared, + ), + non_boot_message: "mupdate override successfully cleared \ + on non-boot disks" + .to_owned(), + } + }); + Self { last_reconciled_config: config, external_disks, @@ -216,6 +231,7 @@ impl ConfigReconcilerInventory { slot_b: Err(err), } }, + clear_mupdate_override, } } } @@ -277,6 +293,32 @@ impl IdOrdItem for OrphanedDataset { id_upcast!(); } +/// Status of clearing the mupdate override in the inventory. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)] +pub struct ClearMupdateOverrideInventory { + /// The result of clearing the mupdate override on the boot disk. + pub boot_disk_result: + Result, + + /// What happened on non-boot disks. + /// + /// We aren't modeling this out in more detail, because we plan to not try + /// and keep ledgered data in sync across both disks in the future. + pub non_boot_message: String, +} + +/// Status of clearing the mupdate override on the boot disk. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)] +pub enum ClearMupdateOverrideBootSuccessInventory { + /// The mupdate override was successfully cleared. + Cleared, + + /// No mupdate override was found. + /// + /// This is considered a success for idempotency reasons. + NoOverride, +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)] #[serde(tag = "result", rename_all = "snake_case")] pub enum ConfigReconcilerInventoryResult { diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 7c1ee230de..9fb9a03f9c 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -43,6 +43,8 @@ use nexus_db_schema::schema::{ }; use nexus_sled_agent_shared::inventory::BootImageHeader; use nexus_sled_agent_shared::inventory::BootPartitionDetails; +use nexus_sled_agent_shared::inventory::ClearMupdateOverrideBootSuccessInventory; +use nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; use nexus_sled_agent_shared::inventory::HostPhase2DesiredContents; use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots; @@ -955,6 +957,8 @@ pub struct InvSledConfigReconciler { boot_disk_error: Option, pub boot_partition_a_error: Option, pub boot_partition_b_error: Option, + #[diesel(embed)] + pub clear_mupdate_override: InvClearMupdateOverride, } impl InvSledConfigReconciler { @@ -965,6 +969,7 @@ impl InvSledConfigReconciler { boot_disk: Result, boot_partition_a_error: Option, boot_partition_b_error: Option, + clear_mupdate_override: InvClearMupdateOverride, ) -> Self { let (boot_disk_slot, boot_disk_error) = match boot_disk { Ok(M2Slot::A) => (Some(SqlU8(0)), None), @@ -980,6 +985,7 @@ impl InvSledConfigReconciler { boot_disk_error, boot_partition_a_error, boot_partition_b_error, + clear_mupdate_override, } } @@ -1019,6 +1025,104 @@ impl InvSledConfigReconciler { } } +// See [`nexus_sled_agent_shared::inventory::DbClearMupdateOverrideBootSuccess`]. +impl_enum_type!( + ClearMupdateOverrideBootSuccessEnum: + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + pub enum DbClearMupdateOverrideBootSuccess; + + // Enum values + Cleared => b"cleared" + NoOverride => b"no-override" +); + +impl From + for DbClearMupdateOverrideBootSuccess +{ + fn from(value: ClearMupdateOverrideBootSuccessInventory) -> Self { + match value { + ClearMupdateOverrideBootSuccessInventory::Cleared => Self::Cleared, + ClearMupdateOverrideBootSuccessInventory::NoOverride => { + Self::NoOverride + } + } + } +} + +impl From + for ClearMupdateOverrideBootSuccessInventory +{ + fn from(value: DbClearMupdateOverrideBootSuccess) -> Self { + match value { + DbClearMupdateOverrideBootSuccess::Cleared => Self::Cleared, + DbClearMupdateOverrideBootSuccess::NoOverride => Self::NoOverride, + } + } +} + +/// See [`nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory`]. +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_sled_config_reconciler)] +pub struct InvClearMupdateOverride { + #[diesel(column_name = clear_mupdate_override_boot_success)] + pub boot_success: Option, + + #[diesel(column_name = clear_mupdate_override_boot_error)] + pub boot_error: Option, + + #[diesel(column_name = clear_mupdate_override_non_boot_message)] + pub non_boot_message: Option, +} + +impl InvClearMupdateOverride { + pub fn new( + clear_mupdate_override: Option<&ClearMupdateOverrideInventory>, + ) -> Self { + let boot_success = clear_mupdate_override.and_then(|inv| { + inv.boot_disk_result.as_ref().ok().map(|v| v.clone().into()) + }); + let boot_error = clear_mupdate_override + .and_then(|inv| inv.boot_disk_result.as_ref().err().cloned()); + let non_boot_message = + clear_mupdate_override.map(|inv| inv.non_boot_message.clone()); + + Self { boot_success, boot_error, non_boot_message } + } + + pub fn into_inventory( + self, + ) -> anyhow::Result> { + match self { + Self { + boot_success: Some(success), + boot_error: None, + non_boot_message: Some(non_boot_message), + } => Ok(Some(ClearMupdateOverrideInventory { + boot_disk_result: Ok(success.into()), + non_boot_message, + })), + Self { + boot_success: None, + boot_error: Some(boot_error), + non_boot_message: Some(non_boot_message), + } => Ok(Some(ClearMupdateOverrideInventory { + boot_disk_result: Err(boot_error), + non_boot_message, + })), + Self { + boot_success: None, + boot_error: None, + non_boot_message: None, + } => Ok(None), + this => Err(anyhow!( + "inv_sled_config_reconciler CHECK constraint violated: \ + clear mupdate override columns are not consistent: {this:?}" + )), + } + } +} + /// See [`nexus_sled_agent_shared::inventory::BootPartitionDetails`]. #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_sled_boot_partition)] diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 31ee927bd2..709f710b5e 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(164, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(165, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(165, "inv-clear-mupdate-override"), KnownVersion::new(164, "fix-leaked-bp-oximeter-read-policy-rows"), KnownVersion::new(163, "bp-desired-host-phase-2"), KnownVersion::new(162, "bundle-by-creation"), diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 307ada9b48..744e1b0d0e 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -30,7 +30,6 @@ use iddqd::IdOrdMap; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_errors::public_error_from_diesel_lookup; -use nexus_db_model::InvCaboose; use nexus_db_model::InvClickhouseKeeperMembership; use nexus_db_model::InvCockroachStatus; use nexus_db_model::InvCollection; @@ -69,6 +68,7 @@ use nexus_db_model::{ }; use nexus_db_model::{HwPowerState, InvZoneManifestNonBoot}; use nexus_db_model::{HwRotSlot, InvMupdateOverrideNonBoot}; +use nexus_db_model::{InvCaboose, InvClearMupdateOverride}; use nexus_db_schema::enums::HwRotSlotEnum; use nexus_db_schema::enums::RotImageErrorEnum; use nexus_db_schema::enums::RotPageWhichEnum; @@ -3531,6 +3531,13 @@ impl DataStore { BootPartitionContents { boot_disk, slot_a, slot_b } }; + let clear_mupdate_override = reconciler + .clear_mupdate_override + .into_inventory() + .map_err(|err| { + Error::internal_error(&format!("{err:#}")) + })?; + Ok::<_, Error>(ConfigReconcilerInventory { last_reconciled_config, external_disks: last_reconciliation_disk_results @@ -3547,6 +3554,7 @@ impl DataStore { .remove(&sled_id) .unwrap_or_default(), boot_partitions, + clear_mupdate_override, }) }) .transpose()?; @@ -3767,6 +3775,9 @@ impl ConfigReconcilerRows { )? }; last_reconciliation_config_id = Some(last_reconciled_config); + let clear_mupdate_override = InvClearMupdateOverride::new( + last_reconciliation.clear_mupdate_override.as_ref(), + ); self.config_reconcilers.push(InvSledConfigReconciler::new( collection_id, @@ -3785,6 +3796,7 @@ impl ConfigReconcilerRows { .as_ref() .err() .cloned(), + clear_mupdate_override, )); // Boot partition _errors_ are kept in `InvSledConfigReconciler` @@ -4033,10 +4045,13 @@ mod test { use nexus_inventory::examples::Representative; use nexus_inventory::examples::representative; use nexus_inventory::now_db_precision; - use nexus_sled_agent_shared::inventory::BootImageHeader; use nexus_sled_agent_shared::inventory::BootPartitionContents; use nexus_sled_agent_shared::inventory::BootPartitionDetails; use nexus_sled_agent_shared::inventory::OrphanedDataset; + use nexus_sled_agent_shared::inventory::{ + BootImageHeader, ClearMupdateOverrideBootSuccessInventory, + ClearMupdateOverrideInventory, + }; use nexus_sled_agent_shared::inventory::{ ConfigReconcilerInventory, ConfigReconcilerInventoryResult, ConfigReconcilerInventoryStatus, OmicronZoneImageSource, @@ -4898,6 +4913,15 @@ mod test { artifact_size: 456789, }), }, + clear_mupdate_override: Some( + ClearMupdateOverrideInventory { + boot_disk_result: Ok( + ClearMupdateOverrideBootSuccessInventory::Cleared, + ), + non_boot_message: "simulated non-boot message" + .to_owned(), + }, + ), } }); diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 2ee2f3ff6a..3adfdf4b70 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -33,6 +33,7 @@ define_enums! { BpZoneDispositionEnum => "bp_zone_disposition", BpZoneImageSourceEnum => "bp_zone_image_source", CabooseWhichEnum => "caboose_which", + ClearMupdateOverrideBootSuccessEnum => "clear_mupdate_override_boot_success", ClickhouseModeEnum => "clickhouse_mode", DatasetKindEnum => "dataset_kind", DnsGroupEnum => "dns_group", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 16c116c8e4..cfba9285ff 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1626,6 +1626,10 @@ table! { boot_partition_a_error -> Nullable, boot_partition_b_error -> Nullable, + + clear_mupdate_override_boot_success -> Nullable, + clear_mupdate_override_boot_error -> Nullable, + clear_mupdate_override_non_boot_message -> Nullable, } } diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 1d289007df..fbc8062743 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -2711,6 +2711,106 @@ fn after_164_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { }) } +fn before_165_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async move { + // Create test data in inv_sled_config_reconciler table before the new columns are added. + ctx.client + .execute( + "INSERT INTO omicron.public.inv_sled_config_reconciler + (inv_collection_id, sled_id, last_reconciled_config, boot_disk_slot) + VALUES + ('11111111-1111-1111-1111-111111111111', '22222222-2222-2222-2222-222222222222', + '33333333-3333-3333-3333-333333333333', 0);", + &[], + ) + .await + .expect("failed to insert pre-migration rows for 165"); + }) +} + +fn after_165_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async move { + // After the migration, the new columns should exist and be NULL for existing rows. + let rows = ctx + .client + .query( + "SELECT + clear_mupdate_override_boot_success, + clear_mupdate_override_boot_error, + clear_mupdate_override_non_boot_message + FROM omicron.public.inv_sled_config_reconciler + WHERE sled_id = '22222222-2222-2222-2222-222222222222';", + &[], + ) + .await + .expect( + "failed to query post-migration inv_sled_config_reconciler", + ); + assert_eq!(rows.len(), 1); + + // All new columns should be NULL for existing rows. + let boot_success: Option = + (&rows[0]).get("clear_mupdate_override_boot_success"); + assert!(boot_success.is_none()); + + let boot_error: Option = + (&rows[0]).get("clear_mupdate_override_boot_error"); + assert!(boot_error.is_none()); + + let non_boot_message: Option = + (&rows[0]).get("clear_mupdate_override_non_boot_message"); + assert!(non_boot_message.is_none()); + + // Test that the constraint allows valid combinations. + // Case 1: All NULL (should work). + ctx.client + .execute( + "INSERT INTO omicron.public.inv_sled_config_reconciler + (inv_collection_id, sled_id, last_reconciled_config, boot_disk_slot, + clear_mupdate_override_boot_success, clear_mupdate_override_boot_error, + clear_mupdate_override_non_boot_message) + VALUES + ('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '33333333-3333-3333-3333-333333333333', + 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 1, NULL, NULL, NULL);", + &[], + ) + .await + .expect("failed to insert row with all NULL clear_mupdate_override columns"); + + // Case 2: Success case (boot_success NOT NULL, boot_error NULL, non_boot_message NOT NULL). + ctx.client + .execute( + "INSERT INTO omicron.public.inv_sled_config_reconciler + (inv_collection_id, sled_id, last_reconciled_config, boot_disk_slot, + clear_mupdate_override_boot_success, clear_mupdate_override_boot_error, + clear_mupdate_override_non_boot_message) + VALUES + ('cccccccc-cccc-cccc-cccc-cccccccccccc', '44444444-4444-4444-4444-444444444444', + 'dddddddd-dddd-dddd-dddd-dddddddddddd', 0, + 'cleared', NULL, 'Non-boot disk cleared successfully');", + &[], + ) + .await + .expect("failed to insert row with success case clear_mupdate_override columns"); + + // Case 3: Error case (boot_success NULL, boot_error NOT NULL, non_boot_message NOT NULL). + ctx.client + .execute( + "INSERT INTO omicron.public.inv_sled_config_reconciler + (inv_collection_id, sled_id, last_reconciled_config, boot_disk_slot, + clear_mupdate_override_boot_success, clear_mupdate_override_boot_error, + clear_mupdate_override_non_boot_message) + VALUES + ('eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee', '55555555-5555-5555-5555-555555555555', + 'ffffffff-ffff-ffff-ffff-ffffffffffff', 1, + NULL, 'Failed to clear mupdate override', 'Non-boot disk operation failed');", + &[], + ) + .await + .expect("failed to insert row with error case clear_mupdate_override columns"); + }) +} + // Lazily initializes all migration checks. The combination of Rust function // pointers and async makes defining a static table fairly painful, so we're // using lazy initialization instead. @@ -2801,6 +2901,10 @@ fn get_migration_checks() -> BTreeMap { Version::new(164, 0, 0), DataMigrationFns::new().before(before_164_0_0).after(after_164_0_0), ); + map.insert( + Version::new(165, 0, 0), + DataMigrationFns::new().before(before_165_0_0).after(after_165_0_0), + ); map } diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 191df26fd9..5dbe9507df 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -19,9 +19,10 @@ use indent_write::fmt::IndentWriter; use itertools::Itertools; use nexus_sled_agent_shared::inventory::{ BootImageHeader, BootPartitionContents, BootPartitionDetails, - ConfigReconcilerInventory, ConfigReconcilerInventoryResult, - ConfigReconcilerInventoryStatus, HostPhase2DesiredContents, - OmicronSledConfig, OmicronZoneImageSource, OrphanedDataset, + ClearMupdateOverrideBootSuccessInventory, ConfigReconcilerInventory, + ConfigReconcilerInventoryResult, ConfigReconcilerInventoryStatus, + HostPhase2DesiredContents, OmicronSledConfig, OmicronZoneImageSource, + OrphanedDataset, }; use omicron_uuid_kinds::{ DatasetUuid, OmicronZoneUuid, PhysicalDiskUuid, ZpoolUuid, @@ -617,6 +618,7 @@ fn display_sleds( orphaned_datasets, zones, boot_partitions, + clear_mupdate_override, } = last_reconciliation; display_boot_partition_contents(boot_partitions, &mut indented)?; @@ -638,6 +640,65 @@ fn display_sleds( { let mut indent2 = IndentWriter::new(" ", &mut indented); + + if let Some(clear_mupdate_override) = clear_mupdate_override { + match &clear_mupdate_override.boot_disk_result { + Ok(ClearMupdateOverrideBootSuccessInventory::Cleared) => { + writeln!( + indent2, + "cleared mupdate override on boot disk", + )?; + } + Ok( + ClearMupdateOverrideBootSuccessInventory::NoOverride, + ) => { + writeln!( + indent2, + "attempted to clear mupdate override \ + on boot disk, but no override was set", + )?; + } + Err(message) => { + writeln!( + indent2, + "failed to clear mupdate override on boot disk: {}", + message + )?; + } + } + writeln!( + indent2, + "clear mupdate override on non-boot disk:" + )?; + + let mut indent3 = IndentWriter::new(" ", &mut indent2); + writeln!( + indent3, + "{}", + clear_mupdate_override.non_boot_message + )?; + } else { + match &zone_image_resolver.mupdate_override.boot_override { + Ok(Some(_)) => { + writeln!( + indent2, + "mupdate override present, but sled agent was not \ + instructed to clear it" + )?; + } + Ok(None) => { + writeln!(indent2, "no mupdate override to clear")?; + } + Err(_) => { + writeln!( + indent2, + "error reading mupdate override, so sled agent was \ + not instructed to clear it" + )?; + } + } + } + if orphaned_datasets.is_empty() { writeln!(indent2, "no orphaned datasets")?; } else { diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9b90c2b8e8..fc428bda6a 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3704,6 +3704,12 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( PRIMARY KEY (inv_collection_id, sled_id) ); +CREATE TYPE IF NOT EXISTS omicron.public.clear_mupdate_override_boot_success +AS ENUM ( + 'cleared', + 'no-override' +); + CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_config_reconciler ( -- where this observation came from -- (foreign key into `inv_collection` table) @@ -3741,6 +3747,37 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_config_reconciler ( boot_partition_a_error TEXT, boot_partition_b_error TEXT, + -- Success clearing the mupdate override. + clear_mupdate_override_boot_success omicron.public.clear_mupdate_override_boot_success, + -- Error clearing the mupdate override. + clear_mupdate_override_boot_error TEXT, + + -- A message describing the result clearing the mupdate override on the + -- non-boot disk. + clear_mupdate_override_non_boot_message TEXT, + + -- Three cases: + -- + -- 1. No clear_mupdate_override instruction was passed in. All three + -- columns are NULL. + -- 2. Clearing the override was successful. boot_success is NOT NULL, + -- boot_error is NULL, and non_boot_message is NOT NULL. + -- 3. Clearing the override failed. boot_success is NULL, boot_error is + -- NOT NULL, and non_boot_message is NOT NULL. + CONSTRAINT clear_mupdate_override_consistency CHECK ( + (clear_mupdate_override_boot_success IS NULL + AND clear_mupdate_override_boot_error IS NULL + AND clear_mupdate_override_non_boot_message IS NULL) + OR + (clear_mupdate_override_boot_success IS NOT NULL + AND clear_mupdate_override_boot_error IS NULL + AND clear_mupdate_override_non_boot_message IS NOT NULL) + OR + (clear_mupdate_override_boot_success IS NULL + AND clear_mupdate_override_boot_error IS NOT NULL + AND clear_mupdate_override_non_boot_message IS NOT NULL) + ), + PRIMARY KEY (inv_collection_id, sled_id) ); @@ -6209,7 +6246,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '164.0.0', NULL) + (TRUE, NOW(), NOW(), '165.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/inv-clear-mupdate-override/up01.sql b/schema/crdb/inv-clear-mupdate-override/up01.sql new file mode 100644 index 0000000000..49287811f7 --- /dev/null +++ b/schema/crdb/inv-clear-mupdate-override/up01.sql @@ -0,0 +1,5 @@ +CREATE TYPE IF NOT EXISTS omicron.public.clear_mupdate_override_boot_success +AS ENUM ( + 'cleared', + 'no-override' +); diff --git a/schema/crdb/inv-clear-mupdate-override/up02.sql b/schema/crdb/inv-clear-mupdate-override/up02.sql new file mode 100644 index 0000000000..25e37e244e --- /dev/null +++ b/schema/crdb/inv-clear-mupdate-override/up02.sql @@ -0,0 +1,4 @@ +ALTER TABLE omicron.public.inv_sled_config_reconciler + ADD COLUMN IF NOT EXISTS clear_mupdate_override_boot_success omicron.public.clear_mupdate_override_boot_success, + ADD COLUMN IF NOT EXISTS clear_mupdate_override_boot_error TEXT, + ADD COLUMN IF NOT EXISTS clear_mupdate_override_non_boot_message TEXT; diff --git a/schema/crdb/inv-clear-mupdate-override/up03.sql b/schema/crdb/inv-clear-mupdate-override/up03.sql new file mode 100644 index 0000000000..98ebb4ee3b --- /dev/null +++ b/schema/crdb/inv-clear-mupdate-override/up03.sql @@ -0,0 +1,14 @@ +ALTER TABLE omicron.public.inv_sled_config_reconciler + ADD CONSTRAINT IF NOT EXISTS clear_mupdate_override_consistency CHECK ( + (clear_mupdate_override_boot_success IS NULL + AND clear_mupdate_override_boot_error IS NULL + AND clear_mupdate_override_non_boot_message IS NULL) + OR + (clear_mupdate_override_boot_success IS NOT NULL + AND clear_mupdate_override_boot_error IS NULL + AND clear_mupdate_override_non_boot_message IS NOT NULL) + OR + (clear_mupdate_override_boot_success IS NULL + AND clear_mupdate_override_boot_error IS NOT NULL + AND clear_mupdate_override_non_boot_message IS NOT NULL) + ); diff --git a/sled-agent/config-reconciler/src/internal_disks.rs b/sled-agent/config-reconciler/src/internal_disks.rs index 5702a8ed66..75c561d692 100644 --- a/sled-agent/config-reconciler/src/internal_disks.rs +++ b/sled-agent/config-reconciler/src/internal_disks.rs @@ -431,19 +431,24 @@ impl InternalDisksWithBootDisk { ) } + pub fn non_boot_disk_zpool_ids( + &self, + ) -> impl Iterator + '_ { + self.inner.disks.iter().filter_map(|disk| { + (disk.id != self.boot_disk).then_some(disk.zpool_id) + }) + } + pub fn non_boot_disk_install_datasets( &self, ) -> impl Iterator + '_ { - self.inner.disks.iter().filter(|disk| disk.id != self.boot_disk).map( - |disk| { - let dataset = ZpoolName::Internal(disk.zpool_id) - .dataset_mountpoint( - &self.inner.mount_config.root, - INSTALL_DATASET, - ); - (disk.zpool_id, dataset) - }, - ) + self.non_boot_disk_zpool_ids().map(|zpool_id| { + let dataset = ZpoolName::Internal(zpool_id).dataset_mountpoint( + &self.inner.mount_config.root, + INSTALL_DATASET, + ); + (zpool_id, dataset) + }) } } diff --git a/sled-agent/config-reconciler/src/reconciler_task.rs b/sled-agent/config-reconciler/src/reconciler_task.rs index 33bc8f7dc1..a6eb8e6bd6 100644 --- a/sled-agent/config-reconciler/src/reconciler_task.rs +++ b/sled-agent/config-reconciler/src/reconciler_task.rs @@ -13,6 +13,7 @@ use illumos_utils::zpool::PathInPool; use illumos_utils::zpool::ZpoolOrRamdisk; use key_manager::StorageKeyRequester; use nexus_sled_agent_shared::inventory::BootPartitionContents as BootPartitionContentsInventory; +use nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; @@ -213,6 +214,7 @@ struct LatestReconciliationResult { zones_inventory: BTreeMap, timesync_status: TimeSyncStatus, boot_partitions: BootPartitionContentsInventory, + clear_mupdate_override: Option, } impl LatestReconciliationResult { @@ -224,6 +226,7 @@ impl LatestReconciliationResult { orphaned_datasets: self.orphaned_datasets.clone(), zones: self.zones_inventory.clone(), boot_partitions: self.boot_partitions.clone(), + clear_mupdate_override: self.clear_mupdate_override.clone(), } } @@ -424,6 +427,23 @@ impl ReconcilerTask { } }; + // Reconcile the mupdate override field. This can be done independently + // of the other parts of reconciliation (and this doesn't have to block + // other parts of reconciliation), but the argument for this is somewhat + // non-trivial. See + // https://rfd.shared.oxide.computer/rfd/556#sa_reconciler_error_handling. + let clear_mupdate_override = + if let Some(override_id) = sled_config.remove_mupdate_override { + let internal_disks = + self.internal_disks_rx.wait_for_boot_disk().await; + Some( + sled_agent_facilities + .clear_mupdate_override(override_id, internal_disks), + ) + } else { + None + }; + // Reconcile any changes to our boot partitions. This is typically a // no-op; if we've successfully read both boot partitions in a previous // reconciliation and don't have new contents to write, it will just @@ -562,6 +582,8 @@ impl ReconcilerTask { zones_inventory: self.zones.to_inventory(), timesync_status, boot_partitions: boot_partitions.into_inventory(), + clear_mupdate_override: clear_mupdate_override + .map(|v| v.to_inventory()), }; self.reconciler_result_tx.send_modify(|r| { r.status = ReconcilerTaskStatus::Idle { diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index 76e0f6f11b..3fd7ed003d 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -1077,6 +1077,7 @@ fn config_differs_only_by_image_source( #[cfg(test)] mod tests { use super::*; + use crate::InternalDisksWithBootDisk; use crate::dataset_serialization_task::DatasetEnsureError; use anyhow::anyhow; use assert_matches::assert_matches; @@ -1102,8 +1103,11 @@ mod tests { use omicron_common::update::OmicronZoneManifestSource; use omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; + use omicron_uuid_kinds::MupdateOverrideUuid; use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::zone_images::ArtifactReadResult; + use sled_agent_types::zone_images::ClearMupdateOverrideBootSuccess; + use sled_agent_types::zone_images::ClearMupdateOverrideResult; use sled_agent_types::zone_images::MupdateOverrideStatus; use sled_agent_types::zone_images::ResolverStatus; use sled_agent_types::zone_images::ZoneManifestArtifactResult; @@ -1230,6 +1234,8 @@ mod tests { } } + const BOOT_DISK_PATH: &str = "/test/boot/disk"; + #[derive(Debug)] struct FakeSledAgentFacilitiesInner { start_responses: VecDeque>, @@ -1239,7 +1245,7 @@ mod tests { impl Default for FakeSledAgentFacilitiesInner { fn default() -> Self { - let boot_disk_path = Utf8PathBuf::from("/test/boot/disk"); + let boot_disk_path = Utf8PathBuf::from(BOOT_DISK_PATH); Self { start_responses: Default::default(), removed_ddm_prefixes: Default::default(), @@ -1316,6 +1322,22 @@ mod tests { self.inner.lock().unwrap().resolver_status.clone() } + fn clear_mupdate_override( + &self, + _override_id: MupdateOverrideUuid, + _internal_disks: InternalDisksWithBootDisk, + ) -> ClearMupdateOverrideResult { + // TODO: In the future, we'll probably want to model this better and + // not just return no-override. + ClearMupdateOverrideResult { + boot_disk_path: Utf8PathBuf::from(BOOT_DISK_PATH), + boot_disk_result: Ok( + ClearMupdateOverrideBootSuccess::NoOverride, + ), + non_boot_disk_info: IdOrdMap::new(), + } + } + fn metrics_untrack_zone_links( &self, _zone: &RunningZone, diff --git a/sled-agent/config-reconciler/src/sled_agent_facilities.rs b/sled-agent/config-reconciler/src/sled_agent_facilities.rs index a6f3d4e858..4f87cd8f71 100644 --- a/sled-agent/config-reconciler/src/sled_agent_facilities.rs +++ b/sled-agent/config-reconciler/src/sled_agent_facilities.rs @@ -11,11 +11,15 @@ use illumos_utils::zpool::PathInPool; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; +use omicron_uuid_kinds::MupdateOverrideUuid; use sled_agent_types::zone_bundle::ZoneBundleCause; +use sled_agent_types::zone_images::ClearMupdateOverrideResult; use sled_agent_types::zone_images::ResolverStatus; use std::future::Future; use tufaceous_artifact::ArtifactHash; +use crate::InternalDisksWithBootDisk; + pub trait SledAgentFacilities: Send + Sync + 'static { /// The underlay VNIC interface in the global zone. /// @@ -40,6 +44,13 @@ pub trait SledAgentFacilities: Send + Sync + 'static { /// Get the status of the zone image resolver. fn zone_image_resolver_status(&self) -> ResolverStatus; + /// Clear out the mupdate override + fn clear_mupdate_override( + &self, + override_id: MupdateOverrideUuid, + internal_disks: InternalDisksWithBootDisk, + ) -> ClearMupdateOverrideResult; + /// Stop tracking metrics for a zone's datalinks. fn metrics_untrack_zone_links( &self, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index c30bf21e98..b77a918f41 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -49,11 +49,14 @@ use omicron_common::backoff::{ BackoffError, retry_notify, retry_policy_internal_service_aggressive, }; use omicron_ddm_admin_client::Client as DdmAdminClient; -use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid}; +use omicron_uuid_kinds::{ + GenericUuid, MupdateOverrideUuid, PropolisUuid, SledUuid, +}; use sled_agent_config_reconciler::{ ConfigReconcilerHandle, ConfigReconcilerSpawnToken, InternalDisksReceiver, - LedgerNewConfigError, LedgerTaskError, ReconcilerInventory, - SledAgentArtifactStore, SledAgentFacilities, TimeSyncStatus, + InternalDisksWithBootDisk, LedgerNewConfigError, LedgerTaskError, + ReconcilerInventory, SledAgentArtifactStore, SledAgentFacilities, + TimeSyncStatus, }; use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfig; @@ -67,7 +70,9 @@ use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, PriorityOrder, StorageLimit, ZoneBundleCause, ZoneBundleMetadata, }; -use sled_agent_types::zone_images::ResolverStatus; +use sled_agent_types::zone_images::{ + ClearMupdateOverrideResult, ResolverStatus, +}; use sled_diagnostics::SledDiagnosticsCmdError; use sled_diagnostics::SledDiagnosticsCmdOutput; use sled_hardware::{HardwareManager, MemoryReservations, underlay}; @@ -1334,6 +1339,16 @@ impl SledAgentFacilities for ReconcilerFacilities { self.service_manager.zone_image_resolver().status() } + fn clear_mupdate_override( + &self, + override_id: MupdateOverrideUuid, + internal_disks: InternalDisksWithBootDisk, + ) -> ClearMupdateOverrideResult { + self.service_manager + .zone_image_resolver() + .clear_mupdate_override(override_id, internal_disks) + } + fn metrics_untrack_zone_links( &self, zone: &RunningZone, diff --git a/sled-agent/types/Cargo.toml b/sled-agent/types/Cargo.toml index a61a14508f..10aab7af38 100644 --- a/sled-agent/types/Cargo.toml +++ b/sled-agent/types/Cargo.toml @@ -32,6 +32,7 @@ sled-hardware-types.workspace = true slog.workspace = true slog-error-chain.workspace = true strum.workspace = true +swrite.workspace = true thiserror.workspace = true toml.workspace = true tufaceous-artifact.workspace = true diff --git a/sled-agent/types/src/zone_images.rs b/sled-agent/types/src/zone_images.rs index a3547b37b4..4ec8f2ea26 100644 --- a/sled-agent/types/src/zone_images.rs +++ b/sled-agent/types/src/zone_images.rs @@ -6,6 +6,8 @@ use std::{fmt, fs::FileType, io, sync::Arc}; use camino::Utf8PathBuf; use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; +use nexus_sled_agent_shared::inventory::ClearMupdateOverrideBootSuccessInventory; +use nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory; use nexus_sled_agent_shared::inventory::MupdateOverrideBootInventory; use nexus_sled_agent_shared::inventory::MupdateOverrideInventory; use nexus_sled_agent_shared::inventory::MupdateOverrideNonBootInventory; @@ -19,8 +21,10 @@ use omicron_common::update::{ MupdateOverrideInfo, OmicronZoneManifest, OmicronZoneManifestSource, }; use omicron_uuid_kinds::InternalZpoolUuid; -use slog::{info, o, warn}; +use omicron_uuid_kinds::MupdateOverrideUuid; +use slog::{error, info, o, warn}; use slog_error_chain::InlineErrorChain; +use swrite::{SWrite, swriteln}; use thiserror::Error; use tufaceous_artifact::ArtifactHash; @@ -730,6 +734,309 @@ pub enum ArtifactReadResult { Error(ArcIoError), } +/// The result of an operation to clear MUPdate overrides on a sled's boot disk. +#[derive(Clone, Debug)] +pub struct ClearMupdateOverrideResult { + /// The path to the override on the boot disk. + pub boot_disk_path: Utf8PathBuf, + + /// The result of clearing the mupdate override on the boot disk. + pub boot_disk_result: + Result, + + /// The result of clearing the mupdate override on non-boot disks. + pub non_boot_disk_info: IdOrdMap, +} + +impl ClearMupdateOverrideResult { + pub fn to_inventory(&self) -> ClearMupdateOverrideInventory { + let boot_disk_result = match &self.boot_disk_result { + Ok(ClearMupdateOverrideBootSuccess::Cleared(_)) => { + Ok(ClearMupdateOverrideBootSuccessInventory::Cleared) + } + Ok(ClearMupdateOverrideBootSuccess::NoOverride) => { + Ok(ClearMupdateOverrideBootSuccessInventory::NoOverride) + } + Err(error) => Err(InlineErrorChain::new(error).to_string()), + }; + + let mut non_boot_message = String::new(); + for info in &self.non_boot_disk_info { + swriteln!( + non_boot_message, + "- for non-boot disk {}: {}", + info.zpool_id, + info.result.display(), + ); + } + + ClearMupdateOverrideInventory { boot_disk_result, non_boot_message } + } + + pub fn log_to(&self, log: &slog::Logger) { + let log = + log.new(o!("boot_disk_path" => self.boot_disk_path.to_string())); + match &self.boot_disk_result { + Ok(info) => { + info!( + log, + "cleared mupdate override on boot disk"; + "prev_info" => ?info, + ); + } + Err(error) => { + error!( + log, + "failed to clear mupdate override on boot disk"; + "error" => InlineErrorChain::new(error), + ); + } + } + + for info in &self.non_boot_disk_info { + info.log_to(&log); + } + } +} + +/// A success condition clearing the mupdate override on a boot disk. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum ClearMupdateOverrideBootSuccess { + /// The mupdate override was matched up and successfully cleared. + Cleared(MupdateOverrideInfo), + + /// No mupdate override was found. + /// + /// This is considered a success condition for idempotency reasons. + NoOverride, +} + +#[derive(Clone, Debug, Error, PartialEq)] +pub enum ClearMupdateOverrideBootError { + #[error( + "mismatch between override ID on boot disk ({actual}) \ + and provided ID ({provided})" + )] + IdMismatch { + /// The actual override ID on the boot disk. + actual: MupdateOverrideUuid, + + /// The override ID provided to the `clear_mupdate_override` method. + provided: MupdateOverrideUuid, + }, + #[error("error removing mupdate override file at `{path}`")] + RemoveError { + /// The path to the mupdate override file that could not be removed. + path: Utf8PathBuf, + + /// The underlying error. + #[source] + error: ArcIoError, + }, + #[error( + "mupdate override at `{path}` not cleared since there was an error reading it" + )] + ReadError { + /// The path to the mupdate override file that could not be read. + path: Utf8PathBuf, + + /// The underlying error. + #[source] + error: MupdateOverrideReadError, + }, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct ClearMupdateOverrideNonBootInfo { + /// The zpool ID of the non-boot disk. + pub zpool_id: InternalZpoolUuid, + + /// The path to the mupdate override file on the disk, or None if no status + /// was available. + pub path: Option, + + /// The result of clearing the mupdate override on the non-boot disk. + pub result: ClearMupdateOverrideNonBootResult, +} + +impl ClearMupdateOverrideNonBootInfo { + pub fn log_to(&self, log: &slog::Logger) { + let log = log.new(o!( + "non_boot_zpool_id" => self.zpool_id.to_string(), + "non_boot_path" => self.path.as_ref().map_or_else( + || "(none)".to_owned(), + |path| path.to_string() + ), + )); + + self.result.log_to(&log); + } +} + +impl IdOrdItem for ClearMupdateOverrideNonBootInfo { + type Key<'a> = InternalZpoolUuid; + + fn key(&self) -> Self::Key<'_> { + self.zpool_id + } + + id_upcast!(); +} + +#[derive(Clone, Debug, PartialEq)] +pub enum ClearMupdateOverrideNonBootResult { + /// The mupdate override was present and was cleared successfully. + Cleared { + /// The previous mupdate override result that was cleared. This could + /// potentially be an invalid override. + prev_result: MupdateOverrideNonBootResult, + }, + + /// No mupdate override was found on the non-boot disk. + NoOverride, + + /// There was an error clearing the mupdate override on the boot disk, so + /// the non-boot disk was not altered. + BootDiskError, + + /// An error occurred while clearing the mupdate override on the non-boot + /// disk. + RemoveError { + /// The path to the MUPdate override file that could not be removed. + path: Utf8PathBuf, + + /// The error that occurred. + error: ArcIoError, + }, + + /// An error occurred while reading the mupdate override on the non-boot + /// disk. + ReadError { + /// The path to the MUPdate override file that could not be read. + path: Utf8PathBuf, + + /// The error that occurred. + error: MupdateOverrideReadError, + }, + + /// No status was found for the non-boot disk, possibly indicating the + /// non-boot disk being missing at the time Sled Agent was started. + NoStatus, + + /// The disk was missing from the latest InternalDisksWithBootDisk but was + /// present at startup. The on-disk data was not altered. + DiskMissing, +} + +impl ClearMupdateOverrideNonBootResult { + pub fn display(&self) -> ClearMupdateOverrideNonBootDisplay<'_> { + ClearMupdateOverrideNonBootDisplay { result: self } + } + + fn log_to(&self, log: &slog::Logger) { + match self { + ClearMupdateOverrideNonBootResult::Cleared { prev_result } => { + info!( + log, + "cleared mupdate override on non-boot disk"; + "prev_result" => %prev_result.display(), + ); + } + ClearMupdateOverrideNonBootResult::BootDiskError => { + warn!( + log, + "mupdate override on non-boot disk not cleared due to \ + boot disk error" + ); + } + ClearMupdateOverrideNonBootResult::RemoveError { path, error } => { + warn!( + log, + "error removing mupdate override file on non-boot disk"; + "path" => %path, + "error" => InlineErrorChain::new(error), + ); + } + ClearMupdateOverrideNonBootResult::ReadError { path, error } => { + warn!( + log, + "error reading mupdate override file on non-boot disk"; + "path" => %path, + "error" => InlineErrorChain::new(error), + ); + } + ClearMupdateOverrideNonBootResult::NoStatus => { + warn!( + log, + "no status available for non-boot disk when sled-agent \ + started, mupdate override not cleared" + ); + } + ClearMupdateOverrideNonBootResult::DiskMissing => { + warn!( + log, + "non-boot disk missing from latest InternalDisks, \ + mupdate override not cleared" + ); + } + ClearMupdateOverrideNonBootResult::NoOverride => { + warn!( + log, + "no mupdate override found on non-boot disk to clear" + ); + } + } + } +} + +pub struct ClearMupdateOverrideNonBootDisplay<'a> { + result: &'a ClearMupdateOverrideNonBootResult, +} + +impl fmt::Display for ClearMupdateOverrideNonBootDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.result { + ClearMupdateOverrideNonBootResult::Cleared { prev_result } => { + write!(f, "cleared (previous: {})", prev_result.display()) + } + ClearMupdateOverrideNonBootResult::BootDiskError => { + write!(f, "not cleared due to boot disk error") + } + ClearMupdateOverrideNonBootResult::RemoveError { path, error } => { + write!( + f, + "error removing file at `{path}`: {}", + InlineErrorChain::new(error), + ) + } + ClearMupdateOverrideNonBootResult::ReadError { path, error } => { + write!( + f, + "error reading file at `{path}`: {}", + InlineErrorChain::new(error), + ) + } + ClearMupdateOverrideNonBootResult::NoStatus => { + write!( + f, + "no status was available when sled-agent was started, \ + so mupdate override not cleared" + ) + } + ClearMupdateOverrideNonBootResult::DiskMissing => { + write!( + f, + "non-boot disk missing from latest InternalDisks, \ + mupdate override not cleared" + ) + } + ClearMupdateOverrideNonBootResult::NoOverride => { + write!(f, "no override to clear") + } + } + } +} + #[derive(Clone, Debug, PartialEq, Error)] pub enum InstallMetadataReadError { #[error( diff --git a/sled-agent/zone-images/src/mupdate_override.rs b/sled-agent/zone-images/src/mupdate_override.rs index cfc565c545..34b37837e4 100644 --- a/sled-agent/zone-images/src/mupdate_override.rs +++ b/sled-agent/zone-images/src/mupdate_override.rs @@ -6,6 +6,8 @@ //! //! For more about commingling MUPdate and update, see RFD 556. +use std::fs; + use crate::AllInstallMetadataFiles; use crate::InstallMetadataNonBootInfo; use crate::InstallMetadataNonBootMismatch; @@ -14,7 +16,14 @@ use camino::Utf8PathBuf; use iddqd::IdOrdMap; use omicron_common::update::MupdateOverrideInfo; use omicron_uuid_kinds::InternalZpoolUuid; +use omicron_uuid_kinds::MupdateOverrideUuid; use sled_agent_config_reconciler::InternalDisksWithBootDisk; +use sled_agent_types::zone_images::ArcIoError; +use sled_agent_types::zone_images::ClearMupdateOverrideBootError; +use sled_agent_types::zone_images::ClearMupdateOverrideBootSuccess; +use sled_agent_types::zone_images::ClearMupdateOverrideNonBootInfo; +use sled_agent_types::zone_images::ClearMupdateOverrideNonBootResult; +use sled_agent_types::zone_images::ClearMupdateOverrideResult; use sled_agent_types::zone_images::MupdateOverrideNonBootInfo; use sled_agent_types::zone_images::MupdateOverrideNonBootMismatch; use sled_agent_types::zone_images::MupdateOverrideNonBootResult; @@ -84,6 +93,166 @@ impl AllMupdateOverrides { } } + pub(crate) fn clear_override( + &mut self, + override_id: MupdateOverrideUuid, + internal_disks: InternalDisksWithBootDisk, + ) -> ClearMupdateOverrideResult { + let boot_disk_result = match self.boot_disk_override.clone() { + Ok(Some(info)) if info.mupdate_uuid == override_id => { + // Remove the override from the boot disk (which is fallible) + // before clearing it in-memory (which is infallible). + match fs::remove_file(&self.boot_disk_path) { + Ok(()) => { + // Remove the in-memory override. + self.boot_disk_override = Ok(None); + Ok(ClearMupdateOverrideBootSuccess::Cleared(info)) + } + Err(error) => { + Err(ClearMupdateOverrideBootError::RemoveError { + path: self.boot_disk_path.clone(), + error: ArcIoError::new(error), + }) + } + } + } + Ok(Some(info)) => { + // The override ID does not match the boot disk override, so we + // shouldn't clear it. + Err(ClearMupdateOverrideBootError::IdMismatch { + actual: info.mupdate_uuid, + provided: override_id, + }) + } + Ok(None) => { + // There is no override on the boot disk, which indicates that + // the override was cleared in a prior attempt. We accept this + // for idempotency reasons. + Ok(ClearMupdateOverrideBootSuccess::NoOverride) + } + Err(error) => { + // If the mupdate override couldn't be read in the first place, + // we don't have enough information to determine if it should be + // cleared. Don't clear it. + Err(ClearMupdateOverrideBootError::ReadError { + path: self.boot_disk_path.clone(), + error, + }) + } + }; + + // Iterate over non-boot disks, clearing overrides if they exist. + let mut non_boot_disk_info = IdOrdMap::new(); + for zpool_id in internal_disks.non_boot_disk_zpool_ids() { + let (path, clear_result) = + match self.non_boot_disk_overrides.get_mut(&zpool_id) { + Some(mut info) => { + let (new_result, clear_result) = + clear_non_boot_disk(&boot_disk_result, &info); + info.result = new_result; + (Some(info.path.clone()), clear_result) + } + None => { + // No status was found on the non-boot disk. + (None, ClearMupdateOverrideNonBootResult::NoStatus) + } + }; + non_boot_disk_info + .insert_unique(ClearMupdateOverrideNonBootInfo { + zpool_id, + path, + result: clear_result, + }) + .expect("non-boot zpool IDs should be unique"); + } + + // Are there any non-boot disks that were originally read at startup but + // are missing from InternalDisksWithBootDisk? + for mut non_boot_disk_override in &mut self.non_boot_disk_overrides { + if !non_boot_disk_info + .contains_key(&non_boot_disk_override.zpool_id) + { + // If the boot disk was successfully cleared, we may have + // introduced a mismatch. + if let Ok(ClearMupdateOverrideBootSuccess::Cleared( + boot_disk_info, + )) = &boot_disk_result + { + let new_result = match &non_boot_disk_override.result { + MupdateOverrideNonBootResult::MatchesPresent => { + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { + non_boot_disk_info: boot_disk_info.clone(), + }, + ) + } + MupdateOverrideNonBootResult::MatchesAbsent => { + unreachable!( + "boot disk absent means that \ + boot_disk_result is always an error" + ) + } + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootPresentOtherAbsent + ) => { + // The mupdate override file is now absent from both + // the boot and the non-boot disk, so this goes from + // mismatch to MatchesAbsent. + MupdateOverrideNonBootResult::MatchesAbsent + } + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { + .. + }, + ) => { + unreachable!( + "boot disk absent means that \ + boot_disk_result is always an error" + ) + } + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::ValueMismatch { non_boot_disk_info }, + ) => { + // Goes to BootAbsentOtherPresent. + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { + non_boot_disk_info: non_boot_disk_info.clone(), + } + ) + } + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootDiskReadError { .. }, + ) => { + unreachable!( + "boot disk read error means that \ + boot_disk_result is always an error" + ) + } + MupdateOverrideNonBootResult::ReadError(_) + => { + non_boot_disk_override.result.clone() + } + }; + + non_boot_disk_override.result = new_result; + } + non_boot_disk_info + .insert_unique(ClearMupdateOverrideNonBootInfo { + zpool_id: non_boot_disk_override.zpool_id, + path: Some(non_boot_disk_override.path.clone()), + result: ClearMupdateOverrideNonBootResult::DiskMissing, + }) + .expect("non-boot zpool IDs should be unique"); + } + } + + ClearMupdateOverrideResult { + boot_disk_path: self.boot_disk_path.clone(), + boot_disk_result, + non_boot_disk_info, + } + } + fn log_results(&self, log: &slog::Logger) { let log = log.new(o!( "component" => "mupdate_override", @@ -181,12 +350,136 @@ fn make_non_boot_info( } } +fn clear_non_boot_disk( + boot_disk_result: &Result< + ClearMupdateOverrideBootSuccess, + ClearMupdateOverrideBootError, + >, + info: &MupdateOverrideNonBootInfo, +) -> (MupdateOverrideNonBootResult, ClearMupdateOverrideNonBootResult) { + match &info.result { + MupdateOverrideNonBootResult::MatchesPresent => { + if boot_disk_result.is_ok() { + // Try removing the file. + remove_non_boot_file(info) + } else { + // There was an error removing the boot disk, so don't alter the + // non-boot disk. + ( + info.result.clone(), + ClearMupdateOverrideNonBootResult::BootDiskError, + ) + } + } + MupdateOverrideNonBootResult::MatchesAbsent => { + // No action needed -- the status stays the same. + (info.result.clone(), ClearMupdateOverrideNonBootResult::NoOverride) + } + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootPresentOtherAbsent, + ) => { + // The file doesn't exist on disk, so the clear result is always + // NoOverride. The new result depends on whether the override was + // cleared on the boot disk. + let new_result = if boot_disk_result.is_ok() { + MupdateOverrideNonBootResult::MatchesAbsent + } else { + info.result.clone() + }; + let clear_result = ClearMupdateOverrideNonBootResult::NoOverride; + (new_result, clear_result) + } + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { .. }, + ) => { + let success = boot_disk_result.as_ref().expect( + "boot disk override being absent is a success condition", + ); + assert!( + matches!(success, ClearMupdateOverrideBootSuccess::NoOverride), + "success condition should be NoOverride, \ + but instead was {success:?}" + ); + + // Always remove the mupdate override on the non-boot disk. Since + // the boot disk is always authoritative, this is a chance for us to + // bring the non-boot disk into alignment. + remove_non_boot_file(info) + } + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::ValueMismatch { .. }, + ) => { + if boot_disk_result.is_ok() { + // Clear out the mupdate override. + remove_non_boot_file(info) + } else { + // There was an error removing the boot disk, so don't alter the + // non-boot disk. + ( + info.result.clone(), + ClearMupdateOverrideNonBootResult::BootDiskError, + ) + } + } + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootDiskReadError { .. }, + ) => { + // There was an error reading the boot disk's mupdate override. The + // safest thing to do is to not alter the non-boot disk. + ( + info.result.clone(), + ClearMupdateOverrideNonBootResult::BootDiskError, + ) + } + MupdateOverrideNonBootResult::ReadError(error) => { + // Don't alter the non-boot disk on error. + ( + info.result.clone(), + ClearMupdateOverrideNonBootResult::ReadError { + path: info.path.clone(), + error: error.clone(), + }, + ) + } + } +} + +/// Removes a non-boot mupdate override file, assuming that the boot mupdate +/// override file was successfully removed and that the non-boot file currently +/// exists. +fn remove_non_boot_file( + info: &MupdateOverrideNonBootInfo, +) -> (MupdateOverrideNonBootResult, ClearMupdateOverrideNonBootResult) { + match fs::remove_file(&info.path) { + Ok(()) => { + // The new status is now MatchesAbsent. + let new_result = MupdateOverrideNonBootResult::MatchesAbsent; + let clear_result = ClearMupdateOverrideNonBootResult::Cleared { + prev_result: info.result.clone(), + }; + (new_result, clear_result) + } + Err(error) => { + let new_result = info.result.clone(); + let clear_result = ClearMupdateOverrideNonBootResult::RemoveError { + path: info.path.clone(), + error: ArcIoError::new(error), + }; + (new_result, clear_result) + } + } +} + #[cfg(test)] mod tests { + use std::io; + use std::os::unix::fs::PermissionsExt; + use super::*; use crate::test_utils::make_internal_disks_rx; + use camino::Utf8Path; use camino_tempfile_ext::prelude::*; use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; @@ -605,4 +898,805 @@ mod tests { logctx.cleanup_successful(); } + + /// Test clearing the override with the boot disk being successful and the + /// non-boot disk being MatchesPresent (the most common case). + #[test] + fn clear_boot_success_non_boot_matches_present() { + let logctx = LogContext::new( + "mupdate_override_clear_boot_success_non_boot_matches_present", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + cx.write_to(&dir.child(&NON_BOOT_PATHS.install_dataset)).unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + let result = + overrides.clear_override(info.mupdate_uuid, internal_disks); + + // The boot disk should be cleared. + assert_eq!( + result.boot_disk_result, + Ok(ClearMupdateOverrideBootSuccess::Cleared(info)) + ); + + // The non-boot disk should be cleared too. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::Cleared { + prev_result: MupdateOverrideNonBootResult::MatchesPresent, + }, + } + } + ); + + // Verify that both files were removed. + assert!(!dir.child(&BOOT_PATHS.mupdate_override_json).exists()); + assert!(!dir.child(&NON_BOOT_PATHS.mupdate_override_json).exists()); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with an ID mismatch on the boot disk and the + /// non-boot disk being MatchesPresent. + #[test] + fn clear_boot_mismatch_non_boot_matches_present() { + let logctx = LogContext::new( + "mupdate_override_clear_boot_mismatch_non_boot_matches_present", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + cx.write_to(&dir.child(&NON_BOOT_PATHS.install_dataset)).unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // Use a different UUID to cause an error clearing the override. + let other_uuid = MupdateOverrideUuid::new_v4(); + let result = overrides.clear_override(other_uuid, internal_disks); + + // The boot disk should get an ID mismatch error. + assert_eq!( + result.boot_disk_result, + Err(ClearMupdateOverrideBootError::IdMismatch { + actual: info.mupdate_uuid, + provided: other_uuid, + }) + ); + + // The non-boot disk should not be altered because clearing the ID + // failed on the boot disk. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::BootDiskError, + } + } + ); + + // Verify that both files still exist. + assert!(dir.child(&BOOT_PATHS.mupdate_override_json).exists()); + assert!(dir.child(&NON_BOOT_PATHS.mupdate_override_json).exists()); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with non-boot disk being MatchesAbsent. + #[test] + fn clear_non_boot_matches_absent() { + let logctx = LogContext::new( + "mupdate_override_clear_non_boot_matches_absent", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + + // Create the non-boot directory, but do not create an override file. + dir.child(&NON_BOOT_PATHS.install_dataset).create_dir_all().unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + let result = + overrides.clear_override(info.mupdate_uuid, internal_disks); + + // The boot disk should be cleared. + assert_eq!( + result.boot_disk_result, + Ok(ClearMupdateOverrideBootSuccess::Cleared(info)) + ); + + // The non-boot disk should remain MatchesAbsent with a NoOverride + // result. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::NoOverride, + } + } + ); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with the boot disk being successful, and the + /// non-boot disk being BootPresentOtherAbsent. + #[test] + fn clear_boot_success_non_boot_boot_present_other_absent() { + let logctx = LogContext::new( + "mupdate_override_clear_boot_success_non_boot_boot_present_other_absent", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + + // Create non-boot directory but no override file. + dir.child(&NON_BOOT_PATHS.install_dataset).create_dir_all().unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // First, verify we have the expected mismatch state. + assert_eq!( + overrides + .non_boot_disk_overrides + .get(&NON_BOOT_UUID) + .unwrap() + .result, + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootPresentOtherAbsent + ) + ); + + let result = + overrides.clear_override(info.mupdate_uuid, internal_disks); + + // The boot disk should be cleared. + assert_eq!( + result.boot_disk_result, + Ok(ClearMupdateOverrideBootSuccess::Cleared(info)) + ); + + // The non-boot disk should transition to MatchesAbsent. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::NoOverride, + } + } + ); + + // Verify that the non-boot disk state was updated to MatchesAbsent. + assert_eq!( + overrides + .non_boot_disk_overrides + .get(&NON_BOOT_UUID) + .unwrap() + .result, + MupdateOverrideNonBootResult::MatchesAbsent + ); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with an ID mismatch on the boot disk, and + /// non-boot disk being BootPresentOtherAbsent. + #[test] + fn clear_boot_mismatch_non_boot_boot_present_other_absent() { + let logctx = LogContext::new( + "mupdate_override_clear_boot_mismatch_non_boot_boot_present_other_absent", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + + // Create non-boot directory but no override file. + dir.child(&NON_BOOT_PATHS.install_dataset).create_dir_all().unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // Use a different UUID to cause a mismatch. + let other_uuid = MupdateOverrideUuid::new_v4(); + let result = overrides.clear_override(other_uuid, internal_disks); + + // The boot disk should get an ID mismatch error. + assert_eq!( + result.boot_disk_result, + Err(ClearMupdateOverrideBootError::IdMismatch { + actual: info.mupdate_uuid, + provided: other_uuid, + }) + ); + + // The non-boot disk should remain in the mismatched state. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::NoOverride, + } + } + ); + + // Verify that the non-boot disk state remained unchanged. + assert_eq!( + overrides + .non_boot_disk_overrides + .get(&NON_BOOT_UUID) + .unwrap() + .result, + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootPresentOtherAbsent + ) + ); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with a BootAbsentOtherPresent mismatch + /// (always clears the non-boot disk). + #[test] + fn clear_non_boot_boot_absent_other_present() { + let logctx = LogContext::new( + "mupdate_override_clear_non_boot_boot_absent_other_present", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + + // Create the boot directory without an override file. + dir.child(&BOOT_PATHS.install_dataset).create_dir_all().unwrap(); + cx.write_to(&dir.child(&NON_BOOT_PATHS.install_dataset)).unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // First, verify we have the expected mismatch state. + assert_eq!( + overrides + .non_boot_disk_overrides + .get(&NON_BOOT_UUID) + .unwrap() + .result, + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { + non_boot_disk_info: info.clone() + } + ) + ); + + let some_uuid = MupdateOverrideUuid::new_v4(); + let result = overrides.clear_override(some_uuid, internal_disks); + + // The boot disk should return NoOverride. + assert_eq!( + result.boot_disk_result, + Ok(ClearMupdateOverrideBootSuccess::NoOverride) + ); + + // The non-boot disk should always be cleared in this case, even though + // there's an ID mismatch. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::Cleared { + prev_result: MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { + non_boot_disk_info: info + } + ), + }, + } + } + ); + + // Verify that the non-boot file was removed. + assert!(!dir.child(&NON_BOOT_PATHS.mupdate_override_json).exists()); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with the boot disk being successful and the + /// non-boot disk being ValueMismatch. + #[test] + fn clear_boot_success_non_boot_value_mismatch() { + let logctx = LogContext::new( + "mupdate_override_clear_boot_success_non_boot_value_mismatch", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + + // Create another context for the non-boot disk. + let cx2 = WriteInstallDatasetContext::new_basic(); + let info2 = cx2.override_info(); + cx2.write_to(&dir.child(&NON_BOOT_PATHS.install_dataset)).unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // First, verify we have the expected mismatch state. + assert_eq!( + overrides + .non_boot_disk_overrides + .get(&NON_BOOT_UUID) + .unwrap() + .result, + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::ValueMismatch { + non_boot_disk_info: info2.clone() + } + ) + ); + + let result = + overrides.clear_override(info.mupdate_uuid, internal_disks); + + // The boot disk should be cleared. + assert_eq!( + result.boot_disk_result, + Ok(ClearMupdateOverrideBootSuccess::Cleared(info)) + ); + + // The non-boot disk should be cleared too. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::Cleared { + prev_result: MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::ValueMismatch { + non_boot_disk_info: info2 + } + ), + }, + } + } + ); + + // Verify that both files were removed. + assert!(!dir.child(&BOOT_PATHS.mupdate_override_json).exists()); + assert!(!dir.child(&NON_BOOT_PATHS.mupdate_override_json).exists()); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with an ID mismatch on the boot disk, and a + /// ValueMismatch on the non-boot disk. + #[test] + fn clear_boot_error_non_boot_value_mismatch() { + let logctx = LogContext::new( + "mupdate_override_clear_boot_error_non_boot_value_mismatch", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + + // Create another context for the non-boot disk. + let cx2 = WriteInstallDatasetContext::new_basic(); + cx2.write_to(&dir.child(&NON_BOOT_PATHS.install_dataset)).unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // Use a different UUID to cause a mismatch on the boot disk. + let other_uuid = MupdateOverrideUuid::new_v4(); + let result = overrides.clear_override(other_uuid, internal_disks); + + // The boot disk should get an ID mismatch error. + assert_eq!( + result.boot_disk_result, + Err(ClearMupdateOverrideBootError::IdMismatch { + actual: info.mupdate_uuid, + provided: other_uuid, + }) + ); + + // The non-boot disk should not be altered due to the boot disk + // mismatch. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::BootDiskError, + } + } + ); + + // Verify that both files still exist. + assert!(dir.child(&BOOT_PATHS.mupdate_override_json).exists()); + assert!(dir.child(&NON_BOOT_PATHS.mupdate_override_json).exists()); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with a boot disk read error. + #[test] + fn clear_non_boot_boot_disk_read_error() { + let logctx = LogContext::new( + "mupdate_override_clear_non_boot_boot_disk_read_error", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + + // Create an empty boot file (read error) and a valid non-boot file. + dir.child(&BOOT_PATHS.mupdate_override_json).touch().unwrap(); + cx.write_to(&dir.child(&NON_BOOT_PATHS.install_dataset)).unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // First, verify we have the expected mismatch state. + assert!(matches!( + overrides + .non_boot_disk_overrides + .get(&NON_BOOT_UUID) + .unwrap() + .result, + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootDiskReadError { .. } + ) + )); + + let other_uuid = MupdateOverrideUuid::new_v4(); + let result = overrides.clear_override(other_uuid, internal_disks); + + // The boot disk should return a read error. + assert!(matches!( + result.boot_disk_result, + Err(ClearMupdateOverrideBootError::ReadError { .. }) + )); + + // The non-boot disk should not be altered due to the boot disk read + // error. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::BootDiskError, + } + } + ); + + // Verify that the non-boot file still exists. + assert!(dir.child(&NON_BOOT_PATHS.mupdate_override_json).exists()); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with a non-boot disk read error. + #[test] + fn clear_non_boot_read_error() { + let logctx = LogContext::new( + "mupdate_override_clear_non_boot_read_error", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + + // Create an empty non-boot file (read error). + dir.child(&NON_BOOT_PATHS.mupdate_override_json).touch().unwrap(); + + let internal_disks = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // First, verify we have the expected read error state. + assert!(matches!( + overrides + .non_boot_disk_overrides + .get(&NON_BOOT_UUID) + .unwrap() + .result, + MupdateOverrideNonBootResult::ReadError(_) + )); + + let result = + overrides.clear_override(info.mupdate_uuid, internal_disks); + + // The boot disk should be cleared. + assert_eq!( + result.boot_disk_result, + Ok(ClearMupdateOverrideBootSuccess::Cleared(info)) + ); + + // The non-boot disk should not be altered due to a read error on the + // disk. + let expected_error = deserialize_error( + dir.path(), + &NON_BOOT_PATHS.mupdate_override_json, + "", + ) + .into(); + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::ReadError { + path: dir.path().join(&NON_BOOT_PATHS.mupdate_override_json), + error: expected_error, + }, + } + } + ); + + // Verify that the boot file was removed but that the non-boot file + // still exists. + assert!(!dir.child(&BOOT_PATHS.mupdate_override_json).exists()); + assert!(dir.child(&NON_BOOT_PATHS.mupdate_override_json).exists()); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with the non-boot disk missing from + /// InternalDisksWithBootDisk. + #[test] + fn clear_non_boot_disk_missing() { + let logctx = LogContext::new( + "mupdate_override_clear_non_boot_disk_missing", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + cx.write_to(&dir.child(&NON_BOOT_PATHS.install_dataset)).unwrap(); + + // Build the `AllMupdateOverrides` with both disks present. + let internal_disks_with_both = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let mut overrides = AllMupdateOverrides::read_all( + &logctx.log, + &internal_disks_with_both, + ); + + // Clear the override with only the boot disk present. + let internal_disks_boot_only = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[]) + .current_with_boot_disk(); + let result = overrides + .clear_override(info.mupdate_uuid, internal_disks_boot_only); + + // The boot disk should be cleared. + assert_eq!( + result.boot_disk_result, + Ok(ClearMupdateOverrideBootSuccess::Cleared(info.clone())) + ); + + // The non-boot disk should return a DiskMissing result. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: Some(dir.path().join(&NON_BOOT_PATHS.mupdate_override_json)), + result: ClearMupdateOverrideNonBootResult::DiskMissing, + } + } + ); + + // Verify that the non-boot disk override was updated accordingly. + assert_eq!( + overrides + .non_boot_disk_overrides + .get(&NON_BOOT_UUID) + .unwrap() + .result, + MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { + non_boot_disk_info: info + } + ) + ); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with no status for non-boot disk. + #[test] + fn clear_no_status() { + let logctx = LogContext::new( + "mupdate_override_clear_no_status", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + + // Build the `AllMupdateOverrides` with just the boot disk present. + let internal_disks_boot_only = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[]) + .current_with_boot_disk(); + let mut overrides = AllMupdateOverrides::read_all( + &logctx.log, + &internal_disks_boot_only, + ); + + // Clear the override with the non-boot disk now present. + let internal_disks_with_both = + make_internal_disks_rx(dir.path(), BOOT_UUID, &[NON_BOOT_UUID]) + .current_with_boot_disk(); + let result = overrides + .clear_override(info.mupdate_uuid, internal_disks_with_both); + + // The boot disk should be cleared. + assert_eq!( + result.boot_disk_result, + Ok(ClearMupdateOverrideBootSuccess::Cleared(info)) + ); + + // The non-boot disk should return a NoStatus result. + assert_eq!( + result.non_boot_disk_info, + id_ord_map! { + ClearMupdateOverrideNonBootInfo { + zpool_id: NON_BOOT_UUID, + path: None, + result: ClearMupdateOverrideNonBootResult::NoStatus, + } + } + ); + + logctx.cleanup_successful(); + } + + /// Test clearing the override with an IO error removing the file. + #[test] + fn clear_file_io_error() { + let logctx = LogContext::new( + "mupdate_override_clear_file_removal_error", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + let cx = WriteInstallDatasetContext::new_basic(); + let info = cx.override_info(); + cx.write_to(&dir.child(&BOOT_PATHS.install_dataset)).unwrap(); + + let internal_disks = make_internal_disks_rx(dir.path(), BOOT_UUID, &[]) + .current_with_boot_disk(); + let mut overrides = + AllMupdateOverrides::read_all(&logctx.log, &internal_disks); + + // Make the parent directory read-only to prevent removing files from + // inside it. + let mut guard = + ReadOnlyDirDropGuard::new(&dir.child(&BOOT_PATHS.install_dataset)) + .unwrap(); + + let result = + overrides.clear_override(info.mupdate_uuid, internal_disks); + + // The boot disk should return a RemoveError. + assert!(matches!( + result.boot_disk_result, + Err(ClearMupdateOverrideBootError::RemoveError { .. }) + )); + assert_eq!(result.non_boot_disk_info, IdOrdMap::new()); + + // Verify in-memory state was not updated. + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap().as_ref(), + Some(&info) + ); + + guard.finish().unwrap(); + + logctx.cleanup_successful(); + } + + /// A drop guard to help set up read-only permissions for a directory, and + /// to clean them up regardless of whether the test panics. + #[derive(Debug)] + struct ReadOnlyDirDropGuard { + path: Utf8PathBuf, + finished: bool, + } + + impl ReadOnlyDirDropGuard { + fn new(path: &Utf8Path) -> io::Result { + fs::set_permissions(path, fs::Permissions::from_mode(0o555))?; + + Ok(Self { path: path.to_owned(), finished: false }) + } + + fn finish(&mut self) -> io::Result<()> { + if self.finished { + return Ok(()); + } + self.finished = true; + fs::set_permissions(&self.path, fs::Permissions::from_mode(0o755)) + } + } + + impl Drop for ReadOnlyDirDropGuard { + fn drop(&mut self) { + // Avoid a double-panic here -- instead, just log a failure. + if let Err(error) = self.finish() { + eprintln!( + "failed to clean up permissions for {}: {error}", + self.path, + ); + } + } + } } diff --git a/sled-agent/zone-images/src/source_resolver.rs b/sled-agent/zone-images/src/source_resolver.rs index 328973cd0d..fe355ec3b8 100644 --- a/sled-agent/zone-images/src/source_resolver.rs +++ b/sled-agent/zone-images/src/source_resolver.rs @@ -12,8 +12,10 @@ use crate::zone_manifest::AllZoneManifests; use camino::Utf8PathBuf; use illumos_utils::running_zone::ZoneImageFileSource; use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; +use omicron_uuid_kinds::MupdateOverrideUuid; use sled_agent_config_reconciler::InternalDisks; use sled_agent_config_reconciler::InternalDisksWithBootDisk; +use sled_agent_types::zone_images::ClearMupdateOverrideResult; use sled_agent_types::zone_images::MupdateOverrideReadError; use sled_agent_types::zone_images::ResolverStatus; use slog::error; @@ -84,6 +86,20 @@ impl ZoneImageSourceResolver { } } } + + /// Clears out the mupdate override field and files on disk. + pub fn clear_mupdate_override( + &self, + override_id: MupdateOverrideUuid, + internal_disks: InternalDisksWithBootDisk, + ) -> ClearMupdateOverrideResult { + let mut inner = self.inner.lock().unwrap(); + let ret = + inner.mupdate_overrides.clear_override(override_id, internal_disks); + + ret.log_to(&inner.log); + ret + } } #[derive(Debug)]