Skip to content

[wip] [30/n] sled-agent logic to clear mupdate overrides #8572

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: sunshowers/spr/main.wip-30n-sled-agent-logic-to-clear-and-honor-mupdate-overrides
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7429,6 +7429,7 @@ fn inv_collection_print_sleds(collection: &Collection) {
orphaned_datasets,
zones,
boot_partitions,
clear_mupdate_override,
} = last_reconciliation;

inv_print_boot_partition_contents(" ", boot_partitions);
Expand Down
42 changes: 42 additions & 0 deletions nexus-sled-agent-shared/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ pub struct ConfigReconcilerInventory {
pub orphaned_datasets: IdOrdMap<OrphanedDataset>,
pub zones: BTreeMap<OmicronZoneUuid, ConfigReconcilerInventoryResult>,
pub boot_partitions: BootPartitionContents,
/// The result of clearing the mupdate override field.
///
/// `None` if `remove_mupdate_override` was not provided in the sled config.
pub clear_mupdate_override: Option<ClearMupdateOverrideInventory>,
}

impl ConfigReconcilerInventory {
Expand Down Expand Up @@ -198,6 +202,17 @@ impl ConfigReconcilerInventory {
.iter()
.map(|z| (z.id, ConfigReconcilerInventoryResult::Ok))
.collect();
let clear_mupdate_override = config.remove_mupdate_override.map(|_| {
ClearMupdateOverrideInventory {
boot_disk_result: Ok(
ClearMupdateOverrideBootSuccess::Cleared,
),
non_boot_message: "mupdate override successfully cleared \
on non-boot disks"
.to_owned(),
}
});

Self {
last_reconciled_config: config,
external_disks,
Expand All @@ -214,6 +229,7 @@ impl ConfigReconcilerInventory {
slot_b: Err(err),
}
},
clear_mupdate_override,
}
}
}
Expand Down Expand Up @@ -275,6 +291,32 @@ impl IdOrdItem for OrphanedDataset {
id_upcast!();
}

/// Status of clearing the mupdate override in the inventory.
//
// Carried as `ConfigReconcilerInventory::clear_mupdate_override`, which is
// `None` when `remove_mupdate_override` was not provided in the sled config.
//
// NOTE(review): `JsonSchema` is derived here, so the `///` text on this type
// presumably feeds generated schema descriptions. It is left verbatim; confirm
// whether any generated artifacts need refreshing before rewording it.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
pub struct ClearMupdateOverrideInventory {
/// The result of clearing the mupdate override on the boot disk.
// On failure the `Err` side is a free-form `String`, not a typed error.
pub boot_disk_result:
Result<ClearMupdateOverrideBootSuccess, String>,

/// What happened on non-boot disks.
///
/// We aren't modeling this out in more detail, because we plan to not try
/// and keep ledgered data in sync across both disks in the future.
// Always present (not an `Option`), regardless of the boot disk outcome.
pub non_boot_message: String,
}

/// Status of clearing the mupdate override on the boot disk.
//
// This is the `Ok` payload of
// `ClearMupdateOverrideInventory::boot_disk_result`.
//
// NOTE(review): unlike `ConfigReconcilerInventoryResult`, this enum carries no
// `#[serde(rename_all = "snake_case")]` attribute — confirm the serialized
// variant names are the ones intended for the wire format.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
pub enum ClearMupdateOverrideBootSuccess {
/// The mupdate override was successfully cleared.
Cleared,

/// No mupdate override was found.
///
/// This is considered a success for idempotency reasons.
NoOverride,
}

#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
#[serde(tag = "result", rename_all = "snake_case")]
pub enum ConfigReconcilerInventoryResult {
Expand Down
102 changes: 102 additions & 0 deletions nexus/db-model/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ use nexus_db_schema::schema::{
};
use nexus_sled_agent_shared::inventory::BootImageHeader;
use nexus_sled_agent_shared::inventory::BootPartitionDetails;
use nexus_sled_agent_shared::inventory::ClearMupdateOverrideBootSuccess;
use nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory;
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus;
use nexus_sled_agent_shared::inventory::HostPhase2DesiredContents;
use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots;
Expand Down Expand Up @@ -955,6 +957,8 @@ pub struct InvSledConfigReconciler {
boot_disk_error: Option<String>,
pub boot_partition_a_error: Option<String>,
pub boot_partition_b_error: Option<String>,
#[diesel(embed)]
pub clear_mupdate_override: InvClearMupdateOverride,
}

impl InvSledConfigReconciler {
Expand All @@ -965,6 +969,7 @@ impl InvSledConfigReconciler {
boot_disk: Result<M2Slot, String>,
boot_partition_a_error: Option<String>,
boot_partition_b_error: Option<String>,
clear_mupdate_override: InvClearMupdateOverride,
) -> Self {
let (boot_disk_slot, boot_disk_error) = match boot_disk {
Ok(M2Slot::A) => (Some(SqlU8(0)), None),
Expand All @@ -980,6 +985,7 @@ impl InvSledConfigReconciler {
boot_disk_error,
boot_partition_a_error,
boot_partition_b_error,
clear_mupdate_override,
}
}

Expand Down Expand Up @@ -1019,6 +1025,102 @@ impl InvSledConfigReconciler {
}
}

// See [`nexus_sled_agent_shared::inventory::ClearMupdateOverrideBootSuccess`].
//
// Database-side mirror of that enum, with one variant per shared variant.
// NOTE(review): the byte strings below are presumably the labels of the
// `clear_mupdate_override_boot_success` SQL enum type — confirm against the
// schema migration.
impl_enum_type!(
ClearMupdateOverrideBootSuccessEnum:

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)]
pub enum DbClearMupdateOverrideBootSuccess;

// Enum values
Cleared => b"cleared"
NoOverride => b"no-override"
);

impl From<ClearMupdateOverrideBootSuccess>
for DbClearMupdateOverrideBootSuccess
{
fn from(value: ClearMupdateOverrideBootSuccess) -> Self {
match value {
ClearMupdateOverrideBootSuccess::Cleared => Self::Cleared,
ClearMupdateOverrideBootSuccess::NoOverride => Self::NoOverride,
}
}
}

impl From<DbClearMupdateOverrideBootSuccess>
for ClearMupdateOverrideBootSuccess
{
fn from(value: DbClearMupdateOverrideBootSuccess) -> Self {
match value {
DbClearMupdateOverrideBootSuccess::Cleared => Self::Cleared,
DbClearMupdateOverrideBootSuccess::NoOverride => Self::NoOverride,
}
}
}

/// See [`nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory`].
///
/// Flattened, column-level form of an optional
/// `ClearMupdateOverrideInventory`, stored in three nullable columns of
/// `inv_sled_config_reconciler`.
///
/// `into_inventory` accepts only three shapes: all fields `None`;
/// `boot_success` plus `non_boot_message`; or `boot_error` plus
/// `non_boot_message`. Any other combination is reported as an error.
#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
#[diesel(table_name = inv_sled_config_reconciler)]
pub struct InvClearMupdateOverride {
/// Boot disk outcome when clearing succeeded; `None` if it failed or if no
/// clear was recorded.
#[diesel(column_name = clear_mupdate_override_boot_success)]
pub boot_success: Option<DbClearMupdateOverrideBootSuccess>,

/// Boot disk error message when clearing failed; `None` if it succeeded or
/// if no clear was recorded.
#[diesel(column_name = clear_mupdate_override_boot_error)]
pub boot_error: Option<String>,

/// Message describing what happened on non-boot disks; `None` only when no
/// clear was recorded.
#[diesel(column_name = clear_mupdate_override_non_boot_message)]
pub non_boot_message: Option<String>,
}

impl InvClearMupdateOverride {
pub fn new(
clear_mupdate_override: Option<&ClearMupdateOverrideInventory>,
) -> Self {
let boot_success = clear_mupdate_override.and_then(|inv| {
inv.boot_disk_result.as_ref().ok().map(|v| v.clone().into())
});
let boot_error = clear_mupdate_override
.and_then(|inv| inv.boot_disk_result.as_ref().err().cloned());
let non_boot_message =
clear_mupdate_override.map(|inv| inv.non_boot_message.clone());

Self { boot_success, boot_error, non_boot_message }
}

pub fn into_inventory(
self,
) -> anyhow::Result<Option<ClearMupdateOverrideInventory>> {
match self {
Self {
boot_success: Some(success),
boot_error: None,
non_boot_message: Some(non_boot_message),
} => Ok(Some(ClearMupdateOverrideInventory {
boot_disk_result: Ok(success.into()),
non_boot_message,
})),
Self {
boot_success: None,
boot_error: Some(boot_error),
non_boot_message: Some(non_boot_message),
} => Ok(Some(ClearMupdateOverrideInventory {
boot_disk_result: Err(boot_error),
non_boot_message,
})),
Self {
boot_success: None,
boot_error: None,
non_boot_message: None,
} => Ok(None),
this => Err(anyhow!(
"inv_sled_config_reconciler CHECK constraint violated: \
clear mupdate override columns are not consistent: {this:?}"
)),
}
}
}

/// See [`nexus_sled_agent_shared::inventory::BootPartitionDetails`].
#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
#[diesel(table_name = inv_sled_boot_partition)]
Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: Version = Version::new(164, 0, 0);
pub const SCHEMA_VERSION: Version = Version::new(165, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(165, "inv-clear-mupdate-override"),
KnownVersion::new(164, "fix-leaked-bp-oximeter-read-policy-rows"),
KnownVersion::new(163, "bp-desired-host-phase-2"),
KnownVersion::new(162, "bundle-by-creation"),
Expand Down
28 changes: 26 additions & 2 deletions nexus/db-queries/src/db/datastore/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ use iddqd::IdOrdMap;
use nexus_db_errors::ErrorHandler;
use nexus_db_errors::public_error_from_diesel;
use nexus_db_errors::public_error_from_diesel_lookup;
use nexus_db_model::InvCaboose;
use nexus_db_model::InvClickhouseKeeperMembership;
use nexus_db_model::InvCockroachStatus;
use nexus_db_model::InvCollection;
Expand Down Expand Up @@ -69,6 +68,7 @@ use nexus_db_model::{
};
use nexus_db_model::{HwPowerState, InvZoneManifestNonBoot};
use nexus_db_model::{HwRotSlot, InvMupdateOverrideNonBoot};
use nexus_db_model::{InvCaboose, InvClearMupdateOverride};
use nexus_db_schema::enums::HwRotSlotEnum;
use nexus_db_schema::enums::RotImageErrorEnum;
use nexus_db_schema::enums::RotPageWhichEnum;
Expand Down Expand Up @@ -3531,6 +3531,13 @@ impl DataStore {
BootPartitionContents { boot_disk, slot_a, slot_b }
};

let clear_mupdate_override = reconciler
.clear_mupdate_override
.into_inventory()
.map_err(|err| {
Error::internal_error(&format!("{err:#}"))
})?;

Ok::<_, Error>(ConfigReconcilerInventory {
last_reconciled_config,
external_disks: last_reconciliation_disk_results
Expand All @@ -3547,6 +3554,7 @@ impl DataStore {
.remove(&sled_id)
.unwrap_or_default(),
boot_partitions,
clear_mupdate_override,
})
})
.transpose()?;
Expand Down Expand Up @@ -3767,6 +3775,9 @@ impl ConfigReconcilerRows {
)?
};
last_reconciliation_config_id = Some(last_reconciled_config);
let clear_mupdate_override = InvClearMupdateOverride::new(
last_reconciliation.clear_mupdate_override.as_ref(),
);

self.config_reconcilers.push(InvSledConfigReconciler::new(
collection_id,
Expand All @@ -3785,6 +3796,7 @@ impl ConfigReconcilerRows {
.as_ref()
.err()
.cloned(),
clear_mupdate_override,
));

// Boot partition _errors_ are kept in `InvSledConfigReconciler`
Expand Down Expand Up @@ -4033,10 +4045,13 @@ mod test {
use nexus_inventory::examples::Representative;
use nexus_inventory::examples::representative;
use nexus_inventory::now_db_precision;
use nexus_sled_agent_shared::inventory::BootImageHeader;
use nexus_sled_agent_shared::inventory::BootPartitionContents;
use nexus_sled_agent_shared::inventory::BootPartitionDetails;
use nexus_sled_agent_shared::inventory::OrphanedDataset;
use nexus_sled_agent_shared::inventory::{
BootImageHeader, ClearMupdateOverrideBootSuccess,
ClearMupdateOverrideInventory,
};
use nexus_sled_agent_shared::inventory::{
ConfigReconcilerInventory, ConfigReconcilerInventoryResult,
ConfigReconcilerInventoryStatus, OmicronZoneImageSource,
Expand Down Expand Up @@ -4898,6 +4913,15 @@ mod test {
artifact_size: 456789,
}),
},
clear_mupdate_override: Some(
ClearMupdateOverrideInventory {
boot_disk_result: Ok(
ClearMupdateOverrideBootSuccess::Cleared,
),
non_boot_message: "simulated non-boot message"
.to_owned(),
},
),
}
});

Expand Down
1 change: 1 addition & 0 deletions nexus/db-schema/src/enums.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ define_enums! {
BpZoneDispositionEnum => "bp_zone_disposition",
BpZoneImageSourceEnum => "bp_zone_image_source",
CabooseWhichEnum => "caboose_which",
ClearMupdateOverrideBootSuccessEnum => "clear_mupdate_override_boot_success",
ClickhouseModeEnum => "clickhouse_mode",
DatasetKindEnum => "dataset_kind",
DnsGroupEnum => "dns_group",
Expand Down
4 changes: 4 additions & 0 deletions nexus/db-schema/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1626,6 +1626,10 @@ table! {

boot_partition_a_error -> Nullable<Text>,
boot_partition_b_error -> Nullable<Text>,

clear_mupdate_override_boot_success -> Nullable<crate::enums::ClearMupdateOverrideBootSuccessEnum>,
clear_mupdate_override_boot_error -> Nullable<Text>,
clear_mupdate_override_non_boot_message -> Nullable<Text>,
}
}

Expand Down
Loading
Loading