From 86d430d3952eb6ac6ebeab28aff0c13e15101dd5 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 15 Jul 2025 14:09:59 -0400 Subject: [PATCH 01/13] add inventory db support for host phase 1 flash hashes --- nexus/db-model/src/inventory.rs | 46 +++++- .../db-queries/src/db/datastore/inventory.rs | 153 ++++++++++++++++++ nexus/db-schema/src/enums.rs | 1 + nexus/db-schema/src/schema.rs | 12 ++ nexus/inventory/src/builder.rs | 74 +++++++++ nexus/inventory/src/examples.rs | 29 ++++ nexus/types/src/inventory.rs | 22 +++ schema/crdb/dbinit.sql | 29 ++++ 8 files changed, 365 insertions(+), 1 deletion(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 7c1ee230de..7d11f52838 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -30,7 +30,7 @@ use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, - inv_last_reconciliation_dataset_result, + inv_host_phase_1_flash_hash, inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_zone_result, inv_mupdate_override_non_boot, @@ -158,6 +158,36 @@ impl From for RotSlot { } } +// See [`M2Slot`]. +impl_enum_type!( + HwHostPhase1SlotEnum: + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + pub enum HwHostPhase1Slot; + + // Enum values + A => b"A" + B => b"B" +); + +impl From for M2Slot { + fn from(value: HwHostPhase1Slot) -> Self { + match value { + HwHostPhase1Slot::A => Self::A, + HwHostPhase1Slot::B => Self::B, + } + } +} + +impl From for HwHostPhase1Slot { + fn from(value: M2Slot) -> Self { + match value { + M2Slot::A => Self::A, + M2Slot::B => Self::B, + } + } +} + // See [`nexus_types::inventory::CabooseWhich`]. impl_enum_type!( CabooseWhichEnum: @@ -752,6 +782,19 @@ impl From for nexus_types::inventory::RotState { } } +/// See [`nexus_types::inventory::HostPhase1FlashHash`]. +#[derive(Queryable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_host_phase_1_flash_hash)] +pub struct InvHostPhase1FlashHash { + pub inv_collection_id: Uuid, + pub hw_baseboard_id: Uuid, + pub time_collected: DateTime, + pub source: String, + + pub slot: HwHostPhase1Slot, + pub hash: ArtifactHash, +} + /// See [`nexus_types::inventory::CabooseFound`]. #[derive(Queryable, Clone, Debug, Selectable)] #[diesel(table_name = inv_caboose)] @@ -966,6 +1009,7 @@ impl InvSledConfigReconciler { boot_partition_a_error: Option, boot_partition_b_error: Option, ) -> Self { + // TODO-john replace this column with the hw_host_phase_1_slot enum? 
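+        // (Today the slot is persisted as a SqlU8: slot A maps to 0 and slot
+        // B maps to 1, per the match below. Reusing the new HwHostPhase1Slot
+        // enum here is only a possible follow-up, not something this patch
+        // changes.)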
let (boot_disk_slot, boot_disk_error) = match boot_disk { Ok(M2Slot::A) => (Some(SqlU8(0)), None), Ok(M2Slot::B) => (Some(SqlU8(1)), None), diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 307ada9b48..c9736727ea 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -30,6 +30,8 @@ use iddqd::IdOrdMap; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_errors::public_error_from_diesel_lookup; +use nexus_db_model::ArtifactHash; +use nexus_db_model::HwHostPhase1Slot; use nexus_db_model::InvCaboose; use nexus_db_model::InvClickhouseKeeperMembership; use nexus_db_model::InvCockroachStatus; @@ -38,6 +40,7 @@ use nexus_db_model::InvCollectionError; use nexus_db_model::InvConfigReconcilerStatus; use nexus_db_model::InvConfigReconcilerStatusKind; use nexus_db_model::InvDataset; +use nexus_db_model::InvHostPhase1FlashHash; use nexus_db_model::InvLastReconciliationDatasetResult; use nexus_db_model::InvLastReconciliationDiskResult; use nexus_db_model::InvLastReconciliationOrphanedDataset; @@ -69,6 +72,7 @@ use nexus_db_model::{ }; use nexus_db_model::{HwPowerState, InvZoneManifestNonBoot}; use nexus_db_model::{HwRotSlot, InvMupdateOverrideNonBoot}; +use nexus_db_schema::enums::HwHostPhase1SlotEnum; use nexus_db_schema::enums::HwRotSlotEnum; use nexus_db_schema::enums::RotImageErrorEnum; use nexus_db_schema::enums::RotPageWhichEnum; @@ -668,6 +672,76 @@ impl DataStore { } } + // Insert rows for the host phase 1 flash hashes that we found. + // Like service processors, we do this using INSERT INTO ... SELECT. + { + use nexus_db_schema::schema::hw_baseboard_id::dsl as baseboard_dsl; + use nexus_db_schema::schema::inv_host_phase_1_flash_hash::dsl as phase1_dsl; + + // Squish our map-of-maps down to a flat iterator. + // + // We can throw away the `_slot` key because the `phase1` + // structures also contain their own slot. (Maybe we could use + // `iddqd` here instead?) + let phase1_hashes = collection + .host_phase_1_flash_hashes + .iter() + .flat_map(|(_slot, by_baseboard)| by_baseboard.iter()); + + for (baseboard_id, phase1) in phase1_hashes { + let selection = nexus_db_schema::schema::hw_baseboard_id::table + .select(( + db_collection_id + .into_sql::(), + baseboard_dsl::id, + phase1.time_collected + .into_sql::(), + phase1.source + .clone() + .into_sql::(), + HwHostPhase1Slot::from(phase1.slot) + .into_sql::(), + ArtifactHash(phase1.hash) + .into_sql::(), + )) + .filter( + baseboard_dsl::part_number + .eq(baseboard_id.part_number.clone()), + ) + .filter( + baseboard_dsl::serial_number + .eq(baseboard_id.serial_number.clone()), + ); + + let _ = diesel::insert_into( + nexus_db_schema::schema::inv_host_phase_1_flash_hash::table, + ) + .values(selection) + .into_columns(( + phase1_dsl::inv_collection_id, + phase1_dsl::hw_baseboard_id, + phase1_dsl::time_collected, + phase1_dsl::source, + phase1_dsl::slot, + phase1_dsl::hash, + )) + .execute_async(&conn) + .await?; + + // See the comment in the above block (where we use + // `inv_service_processor::all_columns()`). The same + // applies here. + let ( + _inv_collection_id, + _hw_baseboard_id, + _time_collected, + _source, + _slot, + _hash, + ) = phase1_dsl::inv_host_phase_1_flash_hash::all_columns(); + } + } + // Insert rows for the cabooses that we found. Like service // processors and roots of trust, we do this using INSERT INTO ... // SELECT. 
This one's a little more complicated because there are @@ -1689,6 +1763,7 @@ impl DataStore { struct NumRowsDeleted { ncollections: usize, nsps: usize, + nhost_phase1_flash_hashes: usize, nrots: usize, ncabooses: usize, nrot_pages: usize, @@ -1719,6 +1794,7 @@ impl DataStore { let NumRowsDeleted { ncollections, nsps, + nhost_phase1_flash_hashes, nrots, ncabooses, nrot_pages, @@ -1768,6 +1844,16 @@ impl DataStore { .await? }; + // Remove rows for host phase 1 flash hashes. + let nhost_phase1_flash_hashes = { + use nexus_db_schema::schema::inv_host_phase_1_flash_hash::dsl; + diesel::delete(dsl::inv_host_phase_1_flash_hash.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + // Remove rows for roots of trust. let nrots = { use nexus_db_schema::schema::inv_root_of_trust::dsl; @@ -2004,6 +2090,7 @@ impl DataStore { Ok(NumRowsDeleted { ncollections, nsps, + nhost_phase1_flash_hashes, nrots, ncabooses, nrot_pages, @@ -2040,6 +2127,7 @@ impl DataStore { "collection_id" => collection_id.to_string(), "ncollections" => ncollections, "nsps" => nsps, + "nhost_phase1_flash_hashes" => nhost_phase1_flash_hashes, "nrots" => nrots, "ncabooses" => ncabooses, "nrot_pages" => nrot_pages, @@ -2544,6 +2632,70 @@ impl DataStore { }) .collect::, _>>()?; + // Fetch records of host phase 1 flash hashes found. + let inv_host_phase_1_flash_hash_rows = { + use nexus_db_schema::schema::inv_host_phase_1_flash_hash::dsl; + + let mut phase_1s = Vec::new(); + + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let mut batch = paginated_multicolumn( + dsl::inv_host_phase_1_flash_hash, + (dsl::hw_baseboard_id, dsl::slot), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvHostPhase1FlashHash::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = p.found_batch(&batch, &|row| { + (row.hw_baseboard_id, row.slot) + }); + phase_1s.append(&mut batch); + } + + phase_1s + }; + // Assemble the lists of host phase 1 flash hashes found. + let mut host_phase_1_flash_hashes = BTreeMap::new(); + for p in inv_host_phase_1_flash_hash_rows { + let slot = M2Slot::from(p.slot); + let by_baseboard = host_phase_1_flash_hashes + .entry(slot) + .or_insert_with(BTreeMap::new); + let Some(bb) = baseboards_by_id.get(&p.hw_baseboard_id) else { + let msg = format!( + "unknown baseboard found in \ + inv_host_phase_1_flash_hash: {}", + p.hw_baseboard_id + ); + return Err(Error::internal_error(&msg)); + }; + + let previous = by_baseboard.insert( + bb.clone(), + nexus_types::inventory::HostPhase1FlashHash { + time_collected: p.time_collected, + source: p.source, + slot, + hash: *p.hash, + }, + ); + bail_unless!( + previous.is_none(), + "duplicate host phase 1 flash hash found: {:?} baseboard {:?}", + p.slot, + p.hw_baseboard_id + ); + } + // Fetch records of cabooses found. let inv_caboose_rows = { use nexus_db_schema::schema::inv_caboose::dsl; @@ -3675,6 +3827,7 @@ impl DataStore { cabooses: cabooses_by_id.values().cloned().collect(), rot_pages: rot_pages_by_id.values().cloned().collect(), sps, + host_phase_1_flash_hashes, rots, cabooses_found, rot_pages_found, diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 2ee2f3ff6a..dfe7f0210e 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -39,6 +39,7 @@ define_enums! 
{ DownstairsClientStopRequestReasonEnum => "downstairs_client_stop_request_reason_type", DownstairsClientStoppedReasonEnum => "downstairs_client_stopped_reason_type", FailureDomainEnum => "failure_domain", + HwHostPhase1SlotEnum => "hw_host_phase_1_slot", HwPowerStateEnum => "hw_power_state", HwRotSlotEnum => "hw_rot_slot", IdentityProviderTypeEnum => "provider_type", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 16c116c8e4..2b1b108864 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1557,6 +1557,18 @@ table! { } } +table! { + inv_host_phase_1_flash_hash (inv_collection_id, hw_baseboard_id, slot) { + inv_collection_id -> Uuid, + hw_baseboard_id -> Uuid, + time_collected -> Timestamptz, + source -> Text, + + slot -> crate::enums::HwHostPhase1SlotEnum, + hash -> Text, + } +} + table! { inv_caboose (inv_collection_id, hw_baseboard_id, which) { inv_collection_id -> Uuid, diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index d9fe84ca7a..84af3322bf 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -25,6 +25,7 @@ use nexus_types::inventory::CabooseFound; use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::CockroachStatus; use nexus_types::inventory::Collection; +use nexus_types::inventory::HostPhase1FlashHash; use nexus_types::inventory::RotPage; use nexus_types::inventory::RotPageFound; use nexus_types::inventory::RotPageWhich; @@ -35,12 +36,14 @@ use nexus_types::inventory::Zpool; use omicron_cockroach_metrics::CockroachMetric; use omicron_cockroach_metrics::NodeId; use omicron_cockroach_metrics::PrometheusMetrics; +use omicron_common::disk::M2Slot; use omicron_uuid_kinds::CollectionKind; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::hash::Hash; use std::sync::Arc; use thiserror::Error; +use tufaceous_artifact::ArtifactHash; use typed_rng::TypedUuidRng; /// Describes an operational error encountered during the collection process @@ -111,6 +114,8 @@ pub struct CollectionBuilder { cabooses: BTreeSet>, rot_pages: BTreeSet>, sps: BTreeMap, ServiceProcessor>, + host_phase_1_flash_hashes: + BTreeMap, HostPhase1FlashHash>>, rots: BTreeMap, RotState>, cabooses_found: BTreeMap, CabooseFound>>, @@ -144,6 +149,7 @@ impl CollectionBuilder { cabooses: BTreeSet::new(), rot_pages: BTreeSet::new(), sps: BTreeMap::new(), + host_phase_1_flash_hashes: BTreeMap::new(), rots: BTreeMap::new(), cabooses_found: BTreeMap::new(), rot_pages_found: BTreeMap::new(), @@ -166,6 +172,7 @@ impl CollectionBuilder { cabooses: self.cabooses, rot_pages: self.rot_pages, sps: self.sps, + host_phase_1_flash_hashes: self.host_phase_1_flash_hashes, rots: self.rots, cabooses_found: self.cabooses_found, rot_pages_found: self.rot_pages_found, @@ -303,6 +310,73 @@ impl CollectionBuilder { Some(baseboard) } + /// Returns true if we already found the host phase 1 flash hash for `slot` + /// for baseboard `baseboard` + /// + /// This is used to avoid requesting it multiple times (from multiple MGS + /// instances). + pub fn found_host_phase_1_flash_hash_already( + &self, + baseboard: &BaseboardId, + slot: M2Slot, + ) -> bool { + self.host_phase_1_flash_hashes + .get(&slot) + .map(|map| map.contains_key(baseboard)) + .unwrap_or(false) + } + + /// Record the given host phase 1 flash hash found for the given baseboard + /// + /// The baseboard must previously have been reported using + /// `found_sp_state()`. 
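+    /// Reporting a hash for a baseboard that was never reported via
+    /// `found_sp_state()`, or reporting a second hash for the same baseboard
+    /// and slot, is treated as a collector bug and returns an error.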
+ /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_host_phase_1_flash_hash( + &mut self, + baseboard: &BaseboardId, + slot: M2Slot, + source: &str, + hash: ArtifactHash, + ) -> Result<(), CollectorBug> { + let (baseboard, _) = + self.sps.get_key_value(baseboard).ok_or_else(|| { + anyhow!( + "reporting host phase 1 flash hash for unknown baseboard: \ + {baseboard:?} ({slot:?}: {hash})", + ) + })?; + let by_id = self + .host_phase_1_flash_hashes + .entry(slot) + .or_insert_with(BTreeMap::new); + if let Some(previous) = by_id.insert( + baseboard.clone(), + HostPhase1FlashHash { + time_collected: now_db_precision(), + source: source.to_owned(), + slot, + hash, + }, + ) { + let error = if previous.hash == hash { + anyhow!("reported multiple times (same value)") + } else { + anyhow!( + "reported host phase 1 flash hash \ + (previously {}, now {hash})", + previous.hash, + ) + }; + Err(CollectorBug::from( + error.context(format!("baseboard {baseboard:?} slot {slot:?}")), + )) + } else { + Ok(()) + } + } + /// Returns true if we already found the caboose for `which` for baseboard /// `baseboard` /// diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index a1c71f7fc4..26cfe12c03 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -216,6 +216,35 @@ pub fn representative() -> Representative { ) .unwrap(); + // Report some phase 1 hashes. + // + // We'll report hashes for both slots for sled 1, only a hash for slot B on + // sled 2, and no hashes for sled 3. + builder + .found_host_phase_1_flash_hash( + &sled1_bb, + M2Slot::A, + "fake MGS 1", + ArtifactHash([1; 32]), + ) + .unwrap(); + builder + .found_host_phase_1_flash_hash( + &sled1_bb, + M2Slot::B, + "fake MGS 1", + ArtifactHash([2; 32]), + ) + .unwrap(); + builder + .found_host_phase_1_flash_hash( + &sled2_bb, + M2Slot::B, + "fake MGS 1", + ArtifactHash([3; 32]), + ) + .unwrap(); + // Report some cabooses. // We'll use the same cabooses for most of these components, although diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index c71136a00f..365a2b3708 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -36,6 +36,7 @@ use omicron_common::api::external::ByteCount; pub use omicron_common::api::internal::shared::NetworkInterface; pub use omicron_common::api::internal::shared::NetworkInterfaceKind; pub use omicron_common::api::internal::shared::SourceNatConfig; +use omicron_common::disk::M2Slot; pub use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::DatasetUuid; @@ -49,6 +50,7 @@ use std::collections::BTreeSet; use std::net::SocketAddrV6; use std::sync::Arc; use strum::EnumIter; +use tufaceous_artifact::ArtifactHash; /// Results of collecting hardware/software inventory from various Omicron /// components @@ -99,6 +101,14 @@ pub struct Collection { /// table. #[serde_as(as = "Vec<(_, _)>")] pub sps: BTreeMap, ServiceProcessor>, + /// all host phase 1 flash hashes, keyed first by the phase 1 slot, then the + /// baseboard id of the sled where they were found + /// + /// In practice, these will be inserted into the + /// `inv_host_phase_1_flash_hash` table. 
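+    /// The inner map is keyed by `BaseboardId`, which cannot be used directly
+    /// as a JSON object key, so (as with `sps` above) it is serialized as a
+    /// list of pairs via `serde_as`.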
+ #[serde_as(as = "BTreeMap<_, Vec<(_, _)>>")] + pub host_phase_1_flash_hashes: + BTreeMap, HostPhase1FlashHash>>, /// all roots of trust, keyed by baseboard id /// /// In practice, these will be inserted into the `inv_root_of_trust` table. @@ -373,6 +383,18 @@ pub struct RotState { pub stage0next_error: Option, } +/// Describes a host phase 1 flash hash found from a service processor +/// during collection +#[derive( + Clone, Debug, Ord, Eq, PartialOrd, PartialEq, Deserialize, Serialize, +)] +pub struct HostPhase1FlashHash { + pub time_collected: DateTime, + pub source: String, + pub slot: M2Slot, + pub hash: ArtifactHash, +} + /// Describes which caboose this is (which component, which slot) #[derive( Clone, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9b90c2b8e8..5102f3045f 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3505,6 +3505,35 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( PRIMARY KEY (inv_collection_id, hw_baseboard_id) ); +-- host phase 1 slots +CREATE TYPE IF NOT EXISTS omicron.public.hw_host_phase_1_slot AS ENUM ( + 'A', + 'B' +); + +-- host phase 1 flash hashes found +-- There are usually two rows here for each row in inv_service_processor, but +-- not necessarily (either or both slots' hash collection may fail). +CREATE TABLE IF NOT EXISTS omicron.public.inv_host_phase_1_flash_hash ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + -- phase 1 slot for this hash + slot omicron.public.hw_host_phase_1_slot NOT NULL, + -- the actual hash of the contents + hash STRING(64) NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id, slot) +); + CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM ( 'sp_slot_0', 'sp_slot_1', From e73406b4e7637576b8f36842a3325b5c39d106c3 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 15 Jul 2025 14:39:31 -0400 Subject: [PATCH 02/13] add phase 1 hashes to inventory collector --- Cargo.lock | 1 + clients/gateway-client/Cargo.toml | 1 + clients/gateway-client/src/lib.rs | 101 ++++++++++++++++++++++++++++++ common/src/disk.rs | 1 + nexus/inventory/src/collector.rs | 55 ++++++++++++++++ 5 files changed, 159 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 89a17d53e7..6b558e4f33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3605,6 +3605,7 @@ dependencies = [ "serde_json", "slog", "thiserror 2.0.12", + "tokio", "uuid", ] diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml index 7633fa95e3..8617fda5fa 100644 --- a/clients/gateway-client/Cargo.toml +++ b/clients/gateway-client/Cargo.toml @@ -23,5 +23,6 @@ serde_json.workspace = true schemars.workspace = true slog.workspace = true thiserror.workspace = true +tokio.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index 8219f7d34b..2599c1dcfd 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -7,6 +7,9 @@ //! 
Interface for API requests to a Management Gateway Service (MGS) instance pub use gateway_messages::SpComponent; +use std::time::Duration; +use std::time::Instant; +use types::ComponentFirmwareHashStatus; // We specifically want to allow consumers, such as `wicketd`, to embed // inventory datatypes into their own APIs, rather than recreate structs. @@ -97,3 +100,101 @@ impl PartialOrd for crate::types::SpIdentifier { Some(self.cmp(other)) } } + +#[derive(Debug, thiserror::Error)] +pub enum HostPhase1HashError { + #[error("timed out waiting for hash calculation")] + Timeout, + #[error("hash calculation failed (phase1 written while hashing?)")] + ContentsModifiedWhileHashing, + #[error("failed to send request to {kind}")] + RequestError { + kind: &'static str, + #[source] + err: Error, + }, +} + +impl Client { + /// Get the hash of the host phase 1 flash contents in the given slot. + /// + /// This operation is implemented asynchronously on the SP: a client (us) + /// must request the hash be calculated, then poll until the calculation is + /// complete. This method takes care of the "start / poll" operation; the + /// caller must provide a timeout for how long they're willing to wait for + /// the calculation to complete. In practice, we expect this to take a + /// handful of seconds on real hardware. + pub async fn host_phase_1_flash_hash_calculate_with_timeout( + &self, + sp: types::SpIdentifier, + phase1_slot: u16, + timeout: Duration, + ) -> Result<[u8; 32], HostPhase1HashError> { + const SLEEP_BETWEEN_POLLS: Duration = Duration::from_secs(1); + const PHASE1_FLASH: &str = + SpComponent::HOST_CPU_BOOT_FLASH.const_as_str(); + + let need_to_start_hashing = match self + .sp_component_hash_firmware_get( + sp.type_, + sp.slot, + PHASE1_FLASH, + phase1_slot, + ) + .await + .map_err(|err| HostPhase1HashError::RequestError { + kind: "get hash", + err, + })? + .into_inner() + { + ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash), + ComponentFirmwareHashStatus::HashInProgress => false, + ComponentFirmwareHashStatus::HashNotCalculated => true, + }; + + if need_to_start_hashing { + self.sp_component_hash_firmware_start( + sp.type_, + sp.slot, + PHASE1_FLASH, + phase1_slot, + ) + .await + .map_err(|err| HostPhase1HashError::RequestError { + kind: "start hashing", + err, + })?; + } + + let start = Instant::now(); + loop { + tokio::time::sleep(SLEEP_BETWEEN_POLLS).await; + if start.elapsed() > timeout { + return Err(HostPhase1HashError::Timeout); + } + match self + .sp_component_hash_firmware_get( + sp.type_, + sp.slot, + PHASE1_FLASH, + phase1_slot, + ) + .await + .map_err(|err| HostPhase1HashError::RequestError { + kind: "get hash", + err, + })? 
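+                // Interpret the poll result: a completed hash means we are
+                // done; "in progress" means keep polling; "not calculated"
+                // after we explicitly started hashing means the flash was
+                // rewritten underneath us, which we surface as an error.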
+ .into_inner() + { + ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash), + ComponentFirmwareHashStatus::HashInProgress => continue, + ComponentFirmwareHashStatus::HashNotCalculated => { + return Err( + HostPhase1HashError::ContentsModifiedWhileHashing, + ); + } + } + } + } +} diff --git a/common/src/disk.rs b/common/src/disk.rs index e7ceb41832..12c29876de 100644 --- a/common/src/disk.rs +++ b/common/src/disk.rs @@ -619,6 +619,7 @@ impl DiskManagementError { Deserialize, Serialize, JsonSchema, + strum::EnumIter, )] pub enum M2Slot { A, diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index eac99c6d04..9ec52bc912 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -16,11 +16,13 @@ use nexus_types::inventory::Collection; use nexus_types::inventory::RotPage; use nexus_types::inventory::RotPageWhich; use omicron_cockroach_metrics::CockroachClusterAdminClient; +use omicron_common::disk::M2Slot; use slog::Logger; use slog::o; use slog::{debug, error}; use std::time::Duration; use strum::IntoEnumIterator; +use tufaceous_artifact::ArtifactHash; /// connection and request timeout used for Sled Agent HTTP client const SLED_AGENT_TIMEOUT: Duration = Duration::from_secs(60); @@ -176,6 +178,59 @@ impl<'a> Collector<'a> { continue; }; + // For each host phase 1 slot, attempt to collect its hash, if it + // hasn't been collected already. Generally, we'd only get here for + // the first MGS client. Assuming that one succeeds, the other(s) + // will skip this loop. + for slot in M2Slot::iter() { + const PHASE1_HASH_TIMEOUT: Duration = Duration::from_secs(30); + + if in_progress + .found_host_phase_1_flash_hash_already(&baseboard_id, slot) + { + continue; + } + + let phase1_slot = match slot { + M2Slot::A => 0, + M2Slot::B => 1, + }; + + let result = client + .host_phase_1_flash_hash_calculate_with_timeout( + sp, + phase1_slot, + PHASE1_HASH_TIMEOUT, + ) + .await + .with_context(|| { + format!( + "MGS {:?}: SP {sp:?}: phase 1 slot {slot:?}", + client.baseurl(), + ) + }); + let hash = match result { + Err(error) => { + in_progress.found_error(InventoryError::from(error)); + continue; + } + Ok(hash) => hash, + }; + if let Err(error) = in_progress.found_host_phase_1_flash_hash( + &baseboard_id, + slot, + client.baseurl(), + ArtifactHash(hash), + ) { + error!( + log, + "error reporting host phase 1 flash hash: \ + {baseboard_id:?} {slot:?} {:?}: {error:#}", + client.baseurl(), + ); + } + } + // For each kind of caboose that we care about, if it hasn't been // fetched already, fetch it and record it. Generally, we'd only // get here for the first MGS client. Assuming that one succeeds, From ddbb2e900bfade1bdf67e3cbfa8b2becfc6a891f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 16 Jul 2025 14:37:50 -0400 Subject: [PATCH 03/13] shave some time off OMDB test --- sp-sim/src/update.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sp-sim/src/update.rs b/sp-sim/src/update.rs index 032f1491c5..c4990c262e 100644 --- a/sp-sim/src/update.rs +++ b/sp-sim/src/update.rs @@ -30,8 +30,9 @@ use sha3::Sha3_256; use tokio::sync::mpsc; // How long do we take to hash host flash? Real SPs take a handful of seconds; -// we'll pick something similar. -const TIME_TO_HASH_HOST_PHASE_1: Duration = Duration::from_secs(5); +// we'll pick something similar but shorter to avoid slowing down tests too +// much. 
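+// (1500 ms is assumed to still be long enough that a poller sees at least one
+// `HashInProgress` response before the simulated hash completes, since the
+// gateway client polls roughly once per second.)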
+const TIME_TO_HASH_HOST_PHASE_1: Duration = Duration::from_millis(1500); pub(crate) struct SimSpUpdate { /// tracks the state of any ongoing simulated update From 8e1feffc2c93963efa0038dd236bf9bd61014946 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 16 Jul 2025 11:12:46 -0400 Subject: [PATCH 04/13] add phase 1 hashes to omdb inventory output --- dev-tools/omdb/src/bin/omdb/db.rs | 26 +++ dev-tools/omdb/tests/successes.out | 236 ++++++++++++++++++++++++ dev-tools/omdb/tests/test_all_output.rs | 47 ++++- nexus/inventory/src/collector.rs | 98 +++++----- nexus/types/src/inventory.rs | 10 + 5 files changed, 364 insertions(+), 53 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 7f320d8ad9..ab1873fecf 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -158,6 +158,7 @@ use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Generation; use omicron_common::api::external::InstanceState; use omicron_common::api::external::MacAddr; +use omicron_common::disk::M2Slot; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::DownstairsRegionUuid; @@ -7213,6 +7214,31 @@ async fn inv_collection_print_devices( println!(""); println!(" found at: {} from {}", sp.time_collected, sp.source); + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct HostPhase1FlashHashRow { + slot: String, + hash: String, + } + + println!(" host phase 1 hashes:"); + let host_phase1_hash_rows: Vec<_> = M2Slot::iter() + .filter_map(|s| { + collection + .host_phase_1_flash_hash_for(s, baseboard_id) + .map(|h| (s, h)) + }) + .map(|(slot, phase1)| HostPhase1FlashHashRow { + slot: format!("{slot:?}"), + hash: phase1.hash.to_string(), + }) + .collect(); + let table = tabled::Table::new(host_phase1_hash_rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{}", textwrap::indent(&table.to_string(), " ")); + #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] struct CabooseRow<'a> { diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index c6322c090d..847c822dfb 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -87,6 +87,242 @@ stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable note: database schema version matches expected () ============================================= +EXECUTING COMMAND: omdb ["db", "inventory", "collections", "show", "latest"] +termination: Exited(0) +--------------------------------------------- +stdout: +collection: ..................... +collector: ..................... 
(likely a Nexus instance) +started: +done: +errors: 1 + error 0: No CockroachDB nodes returned metrics + +Sled SimGimlet00 + part number: i86pc + power: A2 + revision: 0 + MGS slot: Sled 0 (cubby 0) + found at: 2025-07-16 18:39:37.773939 UTC from http://[::1]:REDACTED_PORT + host phase 1 hashes: + SLOT HASH + A e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + B e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimGimletSp SimGimlet 0.0.2 ffffffff n/a + SpSlot1 SimGimletSp SimGimlet 0.0.1 fefefefe n/a + RotSlotA SimRot SimGimletRot 0.0.4 eeeeeeee 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + RotSlotB SimRot SimGimletRot 0.0.3 edededed 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + Stage0 SimRotStage0 SimGimletRot 0.0.200 ddddddddd 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + Stage0Next SimRotStage0 SimGimletRot 0.0.200 dadadadad 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + RoT pages: + SLOT DATA_BASE64 + Cmpa Z2ltbGV0LWNtcGEAAAAAAAAAAAAAAAAA... + CfpaActive Z2ltbGV0LWNmcGEtYWN0aXZlAAAAAAAA... + CfpaInactive Z2ltbGV0LWNmcGEtaWhY3RpdmUAAAAA... + CfpaScratch Z2ltbGV0LWNmcGEtc2NyYXRjaAAAAAAA... + RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + RoT: slot B SHA3-256: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb + +Sled SimGimlet01 + part number: i86pc + power: A2 + revision: 0 + MGS slot: Sled 1 (cubby 1) + found at: 2025-07-16 18:39:37.782787 UTC from http://[::1]:REDACTED_PORT + host phase 1 hashes: + SLOT HASH + A e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + B e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimGimletSp SimGimlet 0.0.2 ffffffff n/a + SpSlot1 SimGimletSp SimGimlet 0.0.1 fefefefe n/a + RotSlotA SimRot SimGimletRot 0.0.4 eeeeeeee 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + RotSlotB SimRot SimGimletRot 0.0.3 edededed 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + Stage0 SimRotStage0 SimGimletRot 0.0.200 ddddddddd 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + Stage0Next SimRotStage0 SimGimletRot 0.0.200 dadadadad 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + RoT pages: + SLOT DATA_BASE64 + Cmpa Z2ltbGV0LWNtcGEAAAAAAAAAAAAAAAAA... + CfpaActive Z2ltbGV0LWNmcGEtYWN0aXZlAAAAAAAA... + CfpaInactive Z2ltbGV0LWNmcGEtaWhY3RpdmUAAAAA... + CfpaScratch Z2ltbGV0LWNmcGEtc2NyYXRjaAAAAAAA... 
+ RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + RoT: slot B SHA3-256: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb + +Switch SimSidecar0 + part number: FAKE_SIM_SIDECAR + power: A2 + revision: 0 + MGS slot: Switch 0 + found at: 2025-07-16 18:39:37.756192 UTC from http://[::1]:REDACTED_PORT + host phase 1 hashes: + SLOT HASH + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimSidecarSp SimSidecar 0.0.2 ffffffff n/a + SpSlot1 SimSidecarSp SimSidecar 0.0.1 fefefefe n/a + RotSlotA SimRot SimSidecarRot 0.0.4 eeeeeeee 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + RotSlotB SimRot SimSidecarRot 0.0.3 edededed 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + Stage0 SimRotStage0 SimSidecarRot 0.0.200 ddddddddd 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + Stage0Next SimRotStage0 SimSidecarRot 0.0.200 dadadadad 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + RoT pages: + SLOT DATA_BASE64 + Cmpa c2lkZWNhci1jbXBhAAAAAAAAAAAAAAAA... + CfpaActive c2lkZWNhci1jZnBhLWFjdGl2ZQAAAAAA... + CfpaInactive c2lkZWNhci1jZnBhLWluYWN0aXZlAAAA... + CfpaScratch c2lkZWNhci1jZnBhLXNjcmF0Y2gAAAAA... + RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + RoT: slot B SHA3-256: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb + +Switch SimSidecar1 + part number: FAKE_SIM_SIDECAR + power: A2 + revision: 0 + MGS slot: Switch 1 + found at: 2025-07-16 18:39:37.766483 UTC from http://[::1]:REDACTED_PORT + host phase 1 hashes: + SLOT HASH + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimSidecarSp SimSidecar 0.0.2 ffffffff n/a + SpSlot1 SimSidecarSp SimSidecar 0.0.1 fefefefe n/a + RotSlotA SimRot SimSidecarRot 0.0.4 eeeeeeee 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + RotSlotB SimRot SimSidecarRot 0.0.3 edededed 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + Stage0 SimRotStage0 SimSidecarRot 0.0.200 ddddddddd 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + Stage0Next SimRotStage0 SimSidecarRot 0.0.200 dadadadad 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf + RoT pages: + SLOT DATA_BASE64 + Cmpa c2lkZWNhci1jbXBhAAAAAAAAAAAAAAAA... + CfpaActive c2lkZWNhci1jZnBhLWFjdGl2ZQAAAAAA... + CfpaInactive c2lkZWNhci1jZnBhLWluYWN0aXZlAAAA... + CfpaScratch c2lkZWNhci1jZnBhLXNjcmF0Y2gAAAAA... + RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + RoT: slot B SHA3-256: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb + +SLED AGENTS + +sled ..................... (role = Scrimlet, serial sim-.....................) 
+ found at: 2025-07-16 18:39:37.824145 UTC from http://[::1]:REDACTED_PORT + address: [::1]:REDACTED_PORT + usable hw threads: 16 + usable memory (GiB): 32 + reservoir (GiB): 16 + +LEDGERED SLED CONFIG + generation: 2 + remove_mupdate_override: None + desired host phase 2 slot a: keep current contents + desired host phase 2 slot b: keep current contents + DISKS: 10 + ID ZPOOL_ID VENDOR MODEL SERIAL + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-13 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-16 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-11 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-19 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-17 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-15 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-10 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-14 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-12 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-18 + dataset config empty + zone config empty + reconciler task status: not yet run + +sled ..................... (role = Scrimlet, serial sim-.....................) + found at: 2025-07-16 18:39:37.840595 UTC from http://[::1]:REDACTED_PORT + address: [::1]:REDACTED_PORT + usable hw threads: 16 + usable memory (GiB): 32 + reservoir (GiB): 16 + datasets: + oxp_...................../crypt/zone/oxz_external_dns_..................... - id: ....................., compression: off + available: 0 B, used: 0 B + reservation: None, quota: None + oxp_...................../crypt/zone/oxz_nexus_..................... - id: ....................., compression: off + available: 0 B, used: 0 B + reservation: None, quota: None + oxp_...................../crypt/zone/oxz_ntp_..................... - id: ....................., compression: off + available: 0 B, used: 0 B + reservation: None, quota: None + oxp_...................../crypt/zone/oxz_crucible_pantry_..................... - id: ....................., compression: off + available: 0 B, used: 0 B + reservation: None, quota: None + oxp_...................../crypt/zone/oxz_internal_dns_..................... - id: ....................., compression: off + available: 0 B, used: 0 B + reservation: None, quota: None + oxp_...................../crypt/zone/oxz_clickhouse_..................... - id: ....................., compression: off + available: 0 B, used: 0 B + reservation: None, quota: None + oxp_...................../crypt/zone/oxz_cockroachdb_..................... - id: ....................., compression: off + available: 0 B, used: 0 B + reservation: None, quota: None + +LEDGERED SLED CONFIG + generation: 2 + remove_mupdate_override: None + desired host phase 2 slot a: keep current contents + desired host phase 2 slot b: keep current contents + DISKS: 10 + ID ZPOOL_ID VENDOR MODEL SERIAL + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-4 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-3 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-8 + ..................... ..................... 
nexus-tests nexus-test-model nexus-test-disk-1 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-0 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-6 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-7 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-2 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-5 + ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-9 + DATASETS: 7 + ID NAME COMPRESSION QUOTA RESERVATION + ..................... oxp_...................../crypt/zone/oxz_crucible_pantry_..................... off none none + ..................... oxp_...................../crypt/zone/oxz_external_dns_..................... off none none + ..................... oxp_...................../crypt/zone/oxz_clickhouse_..................... off none none + ..................... oxp_...................../crypt/zone/oxz_cockroachdb_..................... off none none + ..................... oxp_...................../crypt/zone/oxz_ntp_..................... off none none + ..................... oxp_...................../crypt/zone/oxz_internal_dns_..................... off none none + ..................... oxp_...................../crypt/zone/oxz_nexus_..................... off none none + ZONES: 7 + ID KIND IMAGE_SOURCE + ..................... crucible_pantry install-dataset + ..................... nexus install-dataset + ..................... boundary_ntp install-dataset + ..................... clickhouse install-dataset + ..................... external_dns install-dataset + ..................... internal_dns install-dataset + ..................... cockroach_db install-dataset + reconciler task status: not yet run + +KEEPER MEMBERSHIP +No membership retrieved. 
+ +--------------------------------------------- +stderr: +note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable +note: database schema version matches expected () +warning: 1 collection error was reported above +============================================= EXECUTING COMMAND: omdb ["mgs", "inventory"] termination: Exited(0) --------------------------------------------- diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 3fa997275c..de90a9b93a 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -10,12 +10,15 @@ use dropshot::Method; use expectorate::assert_contents; use http::StatusCode; +use nexus_db_queries::context::OpContext; use nexus_test_utils::wait_for_producer; use nexus_test_utils::{OXIMETER_UUID, PRODUCER_UUID}; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::Blueprint; use nexus_types::deployment::SledFilter; use nexus_types::deployment::UnstableReconfiguratorState; +use omicron_common::api::external::SwitchLocation; +use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_test_utils::dev::test_cmds::Redactor; use omicron_test_utils::dev::test_cmds::path_to_executable; use omicron_test_utils::dev::test_cmds::run_command; @@ -23,6 +26,7 @@ use slog_error_chain::InlineErrorChain; use std::fmt::Write; use std::net::IpAddr; use std::path::Path; +use std::time::Duration; use subprocess::Exec; use uuid::Uuid; @@ -131,17 +135,20 @@ async fn test_omdb_usage_errors() { async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { clear_omdb_env(); - let gwtestctx = gateway_test_utils::setup::test_setup( - "test_omdb_success_case", - gateway_messages::SpPort::One, - ) - .await; let cmd_path = path_to_executable(CMD_OMDB); let postgres_url = cptestctx.database.listen_url(); let nexus_internal_url = format!("http://{}/", cptestctx.internal_client.bind_address); - let mgs_url = format!("http://{}/", gwtestctx.client.bind_address); + let mgs_url = format!( + "http://{}/", + cptestctx + .gateway + .get(&SwitchLocation::Switch0) + .expect("nexus_test always sets up MGS on switch 0") + .client + .bind_address + ); let ox_url = format!("http://{}/", cptestctx.oximeter.server_address()); let ox_test_producer = cptestctx.producer.address().ip(); let ch_url = format!("http://{}/", cptestctx.clickhouse.http_address()); @@ -165,6 +172,31 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { ) .await; + // Wait for Nexus to have gathered at least one inventory collection. (We'll + // check below that `reconfigurator export` contains at least one, so have + // to wait until there's one to export.) 
+ { + let datastore = cptestctx.server.server_context().nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + wait_for_condition( + || async { + match datastore.inventory_get_latest_collection(&opctx).await { + Ok(Some(_)) => Ok(()), + Ok(None) => Err(CondCheckError::NotYet), + Err(err) => Err(CondCheckError::Failed(err)), + } + }, + &Duration::from_millis(500), + &Duration::from_secs(60), + ) + .await + .expect("test nexus gathered an inventory collection"); + } + let mut output = String::new(); let invocations: &[&[&str]] = &[ @@ -175,6 +207,7 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["db", "instances"], &["db", "sleds"], &["db", "sleds", "-F", "discretionary"], + &["db", "inventory", "collections", "show", "latest"], &["mgs", "inventory"], &["nexus", "background-tasks", "doc"], &["nexus", "background-tasks", "show"], @@ -319,8 +352,6 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &ox_url, ox_test_producer, ); - - gwtestctx.teardown().await; } /// Verify that we properly deal with cases where: diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 9ec52bc912..e81fe85a3d 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -10,6 +10,7 @@ use crate::builder::InventoryError; use anyhow::Context; use gateway_client::types::GetCfpaParams; use gateway_client::types::RotCfpaSlot; +use gateway_client::types::SpType; use gateway_messages::SpComponent; use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::Collection; @@ -178,56 +179,63 @@ impl<'a> Collector<'a> { continue; }; - // For each host phase 1 slot, attempt to collect its hash, if it - // hasn't been collected already. Generally, we'd only get here for - // the first MGS client. Assuming that one succeeds, the other(s) - // will skip this loop. - for slot in M2Slot::iter() { - const PHASE1_HASH_TIMEOUT: Duration = Duration::from_secs(30); - - if in_progress - .found_host_phase_1_flash_hash_already(&baseboard_id, slot) - { - continue; - } + // For sled SPs, for each host phase 1 slot, attempt to collect its + // hash, if it hasn't been collected already. Generally, we'd only + // get here for the first MGS client. Assuming that one succeeds, + // the other(s) will skip this loop. 
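+            // (Only sled SPs are asked for these hashes; switches and power
+            // shelves have no host boot flash, which is why the loop below is
+            // gated on the SP type.)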
+ if matches!(sp.type_, SpType::Sled) { + for slot in M2Slot::iter() { + const PHASE1_HASH_TIMEOUT: Duration = + Duration::from_secs(30); + + if in_progress.found_host_phase_1_flash_hash_already( + &baseboard_id, + slot, + ) { + continue; + } - let phase1_slot = match slot { - M2Slot::A => 0, - M2Slot::B => 1, - }; + let phase1_slot = match slot { + M2Slot::A => 0, + M2Slot::B => 1, + }; - let result = client - .host_phase_1_flash_hash_calculate_with_timeout( - sp, - phase1_slot, - PHASE1_HASH_TIMEOUT, - ) - .await - .with_context(|| { - format!( - "MGS {:?}: SP {sp:?}: phase 1 slot {slot:?}", + let result = client + .host_phase_1_flash_hash_calculate_with_timeout( + sp, + phase1_slot, + PHASE1_HASH_TIMEOUT, + ) + .await + .with_context(|| { + format!( + "MGS {:?}: SP {sp:?}: phase 1 slot {slot:?}", + client.baseurl(), + ) + }); + let hash = match result { + Err(error) => { + in_progress + .found_error(InventoryError::from(error)); + continue; + } + Ok(hash) => hash, + }; + if let Err(error) = in_progress + .found_host_phase_1_flash_hash( + &baseboard_id, + slot, client.baseurl(), + ArtifactHash(hash), ) - }); - let hash = match result { - Err(error) => { - in_progress.found_error(InventoryError::from(error)); - continue; + { + error!( + log, + "error reporting host phase 1 flash hash: \ + {baseboard_id:?} {slot:?} {:?}: {error:#}", + client.baseurl(), + ); } - Ok(hash) => hash, - }; - if let Err(error) = in_progress.found_host_phase_1_flash_hash( - &baseboard_id, - slot, - client.baseurl(), - ArtifactHash(hash), - ) { - error!( - log, - "error reporting host phase 1 flash hash: \ - {baseboard_id:?} {slot:?} {:?}: {error:#}", - client.baseurl(), - ); } } diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 365a2b3708..4048b5117d 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -166,6 +166,16 @@ pub struct Collection { } impl Collection { + pub fn host_phase_1_flash_hash_for( + &self, + slot: M2Slot, + baseboard_id: &BaseboardId, + ) -> Option<&HostPhase1FlashHash> { + self.host_phase_1_flash_hashes + .get(&slot) + .and_then(|by_bb| by_bb.get(baseboard_id)) + } + pub fn caboose_for( &self, which: CabooseWhich, From f6043feca7e95af511201a6998b1755a4ef15e6a Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 16 Jul 2025 15:20:32 -0400 Subject: [PATCH 05/13] pull inventory expectorate test back out --- dev-tools/omdb/src/bin/omdb/db.rs | 9 +- dev-tools/omdb/tests/successes.out | 236 ------------------------ dev-tools/omdb/tests/test_all_output.rs | 1 - 3 files changed, 7 insertions(+), 239 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index ab1873fecf..178091afd2 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -7212,7 +7212,11 @@ async fn inv_collection_print_devices( print!(" (cubby {})", sp.sp_slot); } println!(""); - println!(" found at: {} from {}", sp.time_collected, sp.source); + println!( + " found at: {} from {}", + sp.time_collected.to_rfc3339_opts(SecondsFormat::Secs, true), + sp.source + ); #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] @@ -7382,7 +7386,8 @@ fn inv_collection_print_sleds(collection: &Collection) { ); println!( " found at: {} from {}", - sled.time_collected, sled.source + sled.time_collected.to_rfc3339_opts(SecondsFormat::Secs, true), + sled.source ); println!(" address: {}", sled.sled_agent_address); println!(" usable hw threads: {}", sled.usable_hardware_threads); diff --git 
a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 847c822dfb..c6322c090d 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -87,242 +87,6 @@ stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable note: database schema version matches expected () ============================================= -EXECUTING COMMAND: omdb ["db", "inventory", "collections", "show", "latest"] -termination: Exited(0) ---------------------------------------------- -stdout: -collection: ..................... -collector: ..................... (likely a Nexus instance) -started: -done: -errors: 1 - error 0: No CockroachDB nodes returned metrics - -Sled SimGimlet00 - part number: i86pc - power: A2 - revision: 0 - MGS slot: Sled 0 (cubby 0) - found at: 2025-07-16 18:39:37.773939 UTC from http://[::1]:REDACTED_PORT - host phase 1 hashes: - SLOT HASH - A e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 - B e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 - cabooses: - SLOT BOARD NAME VERSION GIT_COMMIT SIGN - SpSlot0 SimGimletSp SimGimlet 0.0.2 ffffffff n/a - SpSlot1 SimGimletSp SimGimlet 0.0.1 fefefefe n/a - RotSlotA SimRot SimGimletRot 0.0.4 eeeeeeee 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - RotSlotB SimRot SimGimletRot 0.0.3 edededed 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - Stage0 SimRotStage0 SimGimletRot 0.0.200 ddddddddd 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - Stage0Next SimRotStage0 SimGimletRot 0.0.200 dadadadad 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - RoT pages: - SLOT DATA_BASE64 - Cmpa Z2ltbGV0LWNtcGEAAAAAAAAAAAAAAAAA... - CfpaActive Z2ltbGV0LWNmcGEtYWN0aXZlAAAAAAAA... - CfpaInactive Z2ltbGV0LWNmcGEtaWhY3RpdmUAAAAA... - CfpaScratch Z2ltbGV0LWNmcGEtc2NyYXRjaAAAAAAA... - RoT: active slot: slot A - RoT: persistent boot preference: slot A - RoT: pending persistent boot preference: - - RoT: transient boot preference: - - RoT: slot A SHA3-256: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa - RoT: slot B SHA3-256: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb - -Sled SimGimlet01 - part number: i86pc - power: A2 - revision: 0 - MGS slot: Sled 1 (cubby 1) - found at: 2025-07-16 18:39:37.782787 UTC from http://[::1]:REDACTED_PORT - host phase 1 hashes: - SLOT HASH - A e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 - B e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 - cabooses: - SLOT BOARD NAME VERSION GIT_COMMIT SIGN - SpSlot0 SimGimletSp SimGimlet 0.0.2 ffffffff n/a - SpSlot1 SimGimletSp SimGimlet 0.0.1 fefefefe n/a - RotSlotA SimRot SimGimletRot 0.0.4 eeeeeeee 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - RotSlotB SimRot SimGimletRot 0.0.3 edededed 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - Stage0 SimRotStage0 SimGimletRot 0.0.200 ddddddddd 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - Stage0Next SimRotStage0 SimGimletRot 0.0.200 dadadadad 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - RoT pages: - SLOT DATA_BASE64 - Cmpa Z2ltbGV0LWNtcGEAAAAAAAAAAAAAAAAA... - CfpaActive Z2ltbGV0LWNmcGEtYWN0aXZlAAAAAAAA... - CfpaInactive Z2ltbGV0LWNmcGEtaWhY3RpdmUAAAAA... - CfpaScratch Z2ltbGV0LWNmcGEtc2NyYXRjaAAAAAAA... 
- RoT: active slot: slot A - RoT: persistent boot preference: slot A - RoT: pending persistent boot preference: - - RoT: transient boot preference: - - RoT: slot A SHA3-256: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa - RoT: slot B SHA3-256: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb - -Switch SimSidecar0 - part number: FAKE_SIM_SIDECAR - power: A2 - revision: 0 - MGS slot: Switch 0 - found at: 2025-07-16 18:39:37.756192 UTC from http://[::1]:REDACTED_PORT - host phase 1 hashes: - SLOT HASH - cabooses: - SLOT BOARD NAME VERSION GIT_COMMIT SIGN - SpSlot0 SimSidecarSp SimSidecar 0.0.2 ffffffff n/a - SpSlot1 SimSidecarSp SimSidecar 0.0.1 fefefefe n/a - RotSlotA SimRot SimSidecarRot 0.0.4 eeeeeeee 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - RotSlotB SimRot SimSidecarRot 0.0.3 edededed 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - Stage0 SimRotStage0 SimSidecarRot 0.0.200 ddddddddd 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - Stage0Next SimRotStage0 SimSidecarRot 0.0.200 dadadadad 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - RoT pages: - SLOT DATA_BASE64 - Cmpa c2lkZWNhci1jbXBhAAAAAAAAAAAAAAAA... - CfpaActive c2lkZWNhci1jZnBhLWFjdGl2ZQAAAAAA... - CfpaInactive c2lkZWNhci1jZnBhLWluYWN0aXZlAAAA... - CfpaScratch c2lkZWNhci1jZnBhLXNjcmF0Y2gAAAAA... - RoT: active slot: slot A - RoT: persistent boot preference: slot A - RoT: pending persistent boot preference: - - RoT: transient boot preference: - - RoT: slot A SHA3-256: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa - RoT: slot B SHA3-256: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb - -Switch SimSidecar1 - part number: FAKE_SIM_SIDECAR - power: A2 - revision: 0 - MGS slot: Switch 1 - found at: 2025-07-16 18:39:37.766483 UTC from http://[::1]:REDACTED_PORT - host phase 1 hashes: - SLOT HASH - cabooses: - SLOT BOARD NAME VERSION GIT_COMMIT SIGN - SpSlot0 SimSidecarSp SimSidecar 0.0.2 ffffffff n/a - SpSlot1 SimSidecarSp SimSidecar 0.0.1 fefefefe n/a - RotSlotA SimRot SimSidecarRot 0.0.4 eeeeeeee 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - RotSlotB SimRot SimSidecarRot 0.0.3 edededed 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - Stage0 SimRotStage0 SimSidecarRot 0.0.200 ddddddddd 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - Stage0Next SimRotStage0 SimSidecarRot 0.0.200 dadadadad 11594bb5548a757e918e6fe056e2ad9e084297c9555417a025d8788eacf55daf - RoT pages: - SLOT DATA_BASE64 - Cmpa c2lkZWNhci1jbXBhAAAAAAAAAAAAAAAA... - CfpaActive c2lkZWNhci1jZnBhLWFjdGl2ZQAAAAAA... - CfpaInactive c2lkZWNhci1jZnBhLWluYWN0aXZlAAAA... - CfpaScratch c2lkZWNhci1jZnBhLXNjcmF0Y2gAAAAA... - RoT: active slot: slot A - RoT: persistent boot preference: slot A - RoT: pending persistent boot preference: - - RoT: transient boot preference: - - RoT: slot A SHA3-256: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa - RoT: slot B SHA3-256: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb - -SLED AGENTS - -sled ..................... (role = Scrimlet, serial sim-.....................) 
- found at: 2025-07-16 18:39:37.824145 UTC from http://[::1]:REDACTED_PORT - address: [::1]:REDACTED_PORT - usable hw threads: 16 - usable memory (GiB): 32 - reservoir (GiB): 16 - -LEDGERED SLED CONFIG - generation: 2 - remove_mupdate_override: None - desired host phase 2 slot a: keep current contents - desired host phase 2 slot b: keep current contents - DISKS: 10 - ID ZPOOL_ID VENDOR MODEL SERIAL - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-13 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-16 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-11 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-19 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-17 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-15 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-10 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-14 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-12 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-18 - dataset config empty - zone config empty - reconciler task status: not yet run - -sled ..................... (role = Scrimlet, serial sim-.....................) - found at: 2025-07-16 18:39:37.840595 UTC from http://[::1]:REDACTED_PORT - address: [::1]:REDACTED_PORT - usable hw threads: 16 - usable memory (GiB): 32 - reservoir (GiB): 16 - datasets: - oxp_...................../crypt/zone/oxz_external_dns_..................... - id: ....................., compression: off - available: 0 B, used: 0 B - reservation: None, quota: None - oxp_...................../crypt/zone/oxz_nexus_..................... - id: ....................., compression: off - available: 0 B, used: 0 B - reservation: None, quota: None - oxp_...................../crypt/zone/oxz_ntp_..................... - id: ....................., compression: off - available: 0 B, used: 0 B - reservation: None, quota: None - oxp_...................../crypt/zone/oxz_crucible_pantry_..................... - id: ....................., compression: off - available: 0 B, used: 0 B - reservation: None, quota: None - oxp_...................../crypt/zone/oxz_internal_dns_..................... - id: ....................., compression: off - available: 0 B, used: 0 B - reservation: None, quota: None - oxp_...................../crypt/zone/oxz_clickhouse_..................... - id: ....................., compression: off - available: 0 B, used: 0 B - reservation: None, quota: None - oxp_...................../crypt/zone/oxz_cockroachdb_..................... - id: ....................., compression: off - available: 0 B, used: 0 B - reservation: None, quota: None - -LEDGERED SLED CONFIG - generation: 2 - remove_mupdate_override: None - desired host phase 2 slot a: keep current contents - desired host phase 2 slot b: keep current contents - DISKS: 10 - ID ZPOOL_ID VENDOR MODEL SERIAL - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-4 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-3 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-8 - ..................... ..................... 
nexus-tests nexus-test-model nexus-test-disk-1 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-0 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-6 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-7 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-2 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-5 - ..................... ..................... nexus-tests nexus-test-model nexus-test-disk-9 - DATASETS: 7 - ID NAME COMPRESSION QUOTA RESERVATION - ..................... oxp_...................../crypt/zone/oxz_crucible_pantry_..................... off none none - ..................... oxp_...................../crypt/zone/oxz_external_dns_..................... off none none - ..................... oxp_...................../crypt/zone/oxz_clickhouse_..................... off none none - ..................... oxp_...................../crypt/zone/oxz_cockroachdb_..................... off none none - ..................... oxp_...................../crypt/zone/oxz_ntp_..................... off none none - ..................... oxp_...................../crypt/zone/oxz_internal_dns_..................... off none none - ..................... oxp_...................../crypt/zone/oxz_nexus_..................... off none none - ZONES: 7 - ID KIND IMAGE_SOURCE - ..................... crucible_pantry install-dataset - ..................... nexus install-dataset - ..................... boundary_ntp install-dataset - ..................... clickhouse install-dataset - ..................... external_dns install-dataset - ..................... internal_dns install-dataset - ..................... cockroach_db install-dataset - reconciler task status: not yet run - -KEEPER MEMBERSHIP -No membership retrieved. 
- ---------------------------------------------- -stderr: -note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected () -warning: 1 collection error was reported above -============================================= EXECUTING COMMAND: omdb ["mgs", "inventory"] termination: Exited(0) --------------------------------------------- diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index de90a9b93a..ba88782e4c 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -207,7 +207,6 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["db", "instances"], &["db", "sleds"], &["db", "sleds", "-F", "discretionary"], - &["db", "inventory", "collections", "show", "latest"], &["mgs", "inventory"], &["nexus", "background-tasks", "doc"], &["nexus", "background-tasks", "show"], From f3970afb816130afee47c6aa1266d3cac4450054 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 16 Jul 2025 16:24:13 -0400 Subject: [PATCH 06/13] schema migration --- nexus/db-model/src/schema_versions.rs | 3 ++- schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql | 4 ++++ schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql | 9 +++++++++ schema/crdb/dbinit.sql | 2 +- 4 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql create mode 100644 schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 31ee927bd2..50c3a51457 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(164, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(165, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(165, "add-inv-host-phase-1-flash-hash"), KnownVersion::new(164, "fix-leaked-bp-oximeter-read-policy-rows"), KnownVersion::new(163, "bp-desired-host-phase-2"), KnownVersion::new(162, "bundle-by-creation"), diff --git a/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql b/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql new file mode 100644 index 0000000000..903a68c72f --- /dev/null +++ b/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.hw_host_phase_1_slot AS ENUM ( + 'A', + 'B' +); diff --git a/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql b/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql new file mode 100644 index 0000000000..e486e656e1 --- /dev/null +++ b/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_host_phase_1_flash_hash ( + inv_collection_id UUID NOT NULL, + hw_baseboard_id UUID NOT NULL, + time_collected TIMESTAMPTZ NOT NULL, + source TEXT NOT NULL, + slot omicron.public.hw_host_phase_1_slot NOT NULL, + hash STRING(64) NOT NULL, + PRIMARY KEY (inv_collection_id, hw_baseboard_id, slot) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 5102f3045f..59f6e33a33 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6238,7 +6238,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '164.0.0', NULL) + (TRUE, NOW(), NOW(), '165.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 3b19f8bdc073224fd4e3d346bd6da2ae09f281f5 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 17 Jul 2025 12:41:37 -0400 Subject: [PATCH 07/13] move wait_for_at_least_one_inventory_collection to test context --- Cargo.lock | 2 ++ dev-tools/omdb/tests/test_all_output.rs | 26 ++------------- nexus/src/lib.rs | 5 +++ nexus/test-interface/Cargo.toml | 1 + nexus/test-interface/src/lib.rs | 4 +++ nexus/test-utils/Cargo.toml | 1 + nexus/test-utils/src/lib.rs | 43 +++++++++++++++++++++++++ 7 files changed, 59 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b558e4f33..3ddca35d93 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6851,6 +6851,7 @@ version = "0.1.0" dependencies = [ "async-trait", "nexus-config", + "nexus-db-queries", "nexus-sled-agent-shared", "nexus-types", "omicron-common", @@ -6908,6 +6909,7 @@ dependencies = [ "serde_urlencoded", "sled-agent-client", "slog", + "slog-error-chain", "tokio", "tokio-postgres", "tokio-util", diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index ba88782e4c..3acbd9062e 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -10,7 +10,6 @@ use dropshot::Method; use expectorate::assert_contents; use http::StatusCode; -use nexus_db_queries::context::OpContext; use nexus_test_utils::wait_for_producer; use nexus_test_utils::{OXIMETER_UUID, PRODUCER_UUID}; use nexus_test_utils_macros::nexus_test; @@ -18,7 +17,6 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::SledFilter; use nexus_types::deployment::UnstableReconfiguratorState; use omicron_common::api::external::SwitchLocation; -use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_test_utils::dev::test_cmds::Redactor; use omicron_test_utils::dev::test_cmds::path_to_executable; use omicron_test_utils::dev::test_cmds::run_command; @@ -175,27 +173,9 @@ 
async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { // Wait for Nexus to have gathered at least one inventory collection. (We'll // check below that `reconfigurator export` contains at least one, so have // to wait until there's one to export.) - { - let datastore = cptestctx.server.server_context().nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); - - wait_for_condition( - || async { - match datastore.inventory_get_latest_collection(&opctx).await { - Ok(Some(_)) => Ok(()), - Ok(None) => Err(CondCheckError::NotYet), - Err(err) => Err(CondCheckError::Failed(err)), - } - }, - &Duration::from_millis(500), - &Duration::from_secs(60), - ) - .await - .expect("test nexus gathered an inventory collection"); - } + cptestctx + .wait_for_at_least_one_inventory_collection(Duration::from_secs(60)) + .await; let mut output = String::new(); diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index cd6ba1393e..fc32a4824f 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -27,6 +27,7 @@ use external_api::http_entrypoints::external_api; use internal_api::http_entrypoints::internal_api; use nexus_config::NexusConfig; use nexus_db_model::RendezvousDebugDataset; +use nexus_db_queries::db; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; @@ -335,6 +336,10 @@ impl nexus_test_interface::NexusServer for Server { Server::start(internal_server).await.unwrap() } + fn datastore(&self) -> &Arc { + self.apictx.context.nexus.datastore() + } + async fn get_http_server_external_address(&self) -> SocketAddr { self.apictx.context.nexus.get_external_server_address().await.unwrap() } diff --git a/nexus/test-interface/Cargo.toml b/nexus/test-interface/Cargo.toml index 00da4cb6a3..f608645f6b 100644 --- a/nexus/test-interface/Cargo.toml +++ b/nexus/test-interface/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [dependencies] async-trait.workspace = true nexus-config.workspace = true +nexus-db-queries.workspace = true nexus-sled-agent-shared.workspace = true nexus-types.workspace = true omicron-common.workspace = true diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index b6a38be063..b32491da30 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -33,6 +33,7 @@ use async_trait::async_trait; use nexus_config::NexusConfig; +use nexus_db_queries::db; use nexus_types::deployment::Blueprint; use nexus_types::internal_api::params::{ PhysicalDiskPutRequest, ZpoolPutRequest, @@ -43,6 +44,7 @@ use omicron_common::disk::DatasetKind; use omicron_uuid_kinds::DatasetUuid; use slog::Logger; use std::net::{SocketAddr, SocketAddrV6}; +use std::sync::Arc; #[async_trait] pub trait NexusServer: Send + Sync + 'static { @@ -81,6 +83,8 @@ pub trait NexusServer: Send + Sync + 'static { >, ) -> Self; + fn datastore(&self) -> &Arc; + async fn get_http_server_external_address(&self) -> SocketAddr; async fn get_http_server_techport_address(&self) -> SocketAddr; async fn get_http_server_internal_address(&self) -> SocketAddr; diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index fd01779c74..81ab9be3f9 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -50,6 +50,7 @@ serde_json.workspace = true serde_urlencoded.workspace = true sled-agent-client.workspace = true slog.workspace = true +slog-error-chain.workspace = true tokio.workspace = true tokio-postgres = { workspace = 
true, features = ["with-serde_json-1"] } tokio-util.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index d92c056cb0..95c6cc7e60 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -33,6 +33,7 @@ use nexus_config::InternalDns; use nexus_config::MgdConfig; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; use nexus_config::NexusConfig; +use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pub_test_utils::crdb; use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots; use nexus_sled_agent_shared::inventory::OmicronSledConfig; @@ -79,6 +80,7 @@ use omicron_common::disk::CompressionAlgorithm; use omicron_common::zpool_name::ZpoolName; use omicron_sled_agent::sim; use omicron_test_utils::dev; +use omicron_test_utils::dev::poll; use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::DatasetUuid; @@ -95,6 +97,7 @@ use sled_agent_client::types::EarlyNetworkConfig; use sled_agent_client::types::EarlyNetworkConfigBody; use sled_agent_client::types::RackNetworkConfigV2; use slog::{Logger, debug, error, o}; +use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::HashMap; use std::fmt::Debug; @@ -226,6 +229,46 @@ impl ControlPlaneTestContext { format!("*.sys.{}", self.external_dns_zone_name) } + /// Wait until at least one inventory collection has been inserted into the + /// datastore. + /// + /// # Panics + /// + /// Panics if an inventory collection is not found within `timeout`. + pub async fn wait_for_at_least_one_inventory_collection( + &self, + timeout: Duration, + ) { + let datastore = self.server.datastore(); + let opctx = + OpContext::for_tests(self.logctx.log.clone(), datastore.clone()); + + match wait_for_condition( + || async { + match datastore.inventory_get_latest_collection(&opctx).await { + Ok(Some(_)) => Ok(()), + Ok(None) => Err(CondCheckError::NotYet), + Err(err) => Err(CondCheckError::Failed(err)), + } + }, + &Duration::from_millis(500), + &timeout, + ) + .await + { + Ok(()) => (), + Err(poll::Error::TimedOut(elapsed)) => { + panic!("no inventory collection found within {elapsed:?}"); + } + Err(poll::Error::PermanentError(err)) => { + panic!( + "failed waiting for inventory collection: {}", + InlineErrorChain::new(&err) + ); + } + } + } + pub async fn teardown(mut self) { self.server.close().await; self.database.cleanup().await.unwrap(); From cd5b8a24d5fdba8f1561eb4c82af7e867a3b4eeb Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 17 Jul 2025 12:54:15 -0400 Subject: [PATCH 08/13] fix tests that need to wait for an inventory collection --- nexus/tests/integration_tests/rack.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/nexus/tests/integration_tests/rack.rs b/nexus/tests/integration_tests/rack.rs index 6465e915fd..9eebe3d213 100644 --- a/nexus/tests/integration_tests/rack.rs +++ b/nexus/tests/integration_tests/rack.rs @@ -23,6 +23,7 @@ use nexus_types::internal_api::params::SledAgentInfo; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; use omicron_uuid_kinds::GenericUuid; +use std::time::Duration; use uuid::Uuid; type ControlPlaneTestContext = @@ -98,6 +99,12 @@ async fn test_rack_initialization(cptestctx: &ControlPlaneTestContext) { #[nexus_test] async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { + // Setup: wait until we've collected an inventory from the system set + // up by 
`#[nexus_test]. + cptestctx + .wait_for_at_least_one_inventory_collection(Duration::from_secs(60)) + .await; + let internal_client = &cptestctx.internal_client; let external_client = &cptestctx.external_client; let list_url = "/v1/system/hardware/sleds-uninitialized"; @@ -159,6 +166,12 @@ async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { #[nexus_test] async fn test_sled_add(cptestctx: &ControlPlaneTestContext) { + // Setup: wait until we've collected an inventory from the system set + // up by `#[nexus_test]. + cptestctx + .wait_for_at_least_one_inventory_collection(Duration::from_secs(60)) + .await; + let external_client = &cptestctx.external_client; let list_url = "/v1/system/hardware/sleds-uninitialized"; let mut uninitialized_sleds = From cd8f1e0c05947ba2fc263dcf77c2ea2a3a1dca8f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 18 Jul 2025 11:32:48 -0400 Subject: [PATCH 09/13] reconfigurator-cli test: `inventory-show ... all` --- .../tests/input/cmds-example.txt | 2 +- .../tests/output/cmds-example-stdout | 62 ++++++++++++++++++- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/dev-tools/reconfigurator-cli/tests/input/cmds-example.txt b/dev-tools/reconfigurator-cli/tests/input/cmds-example.txt index a3cd8aa94b..21f8a27c0f 100644 --- a/dev-tools/reconfigurator-cli/tests/input/cmds-example.txt +++ b/dev-tools/reconfigurator-cli/tests/input/cmds-example.txt @@ -52,7 +52,7 @@ load-example --seed test-basic --nsleds 3 --sled-policy 1:non-provisionable --sl blueprint-list blueprint-show latest -inventory-show latest +inventory-show latest all # Plan a blueprint run -- this will cause zones and disks on the expunged # sled to be expunged. diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 859052fab2..688ec00e69 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1013,13 +1013,73 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9 PENDING MGS-MANAGED UPDATES: 0 -> inventory-show latest +> inventory-show latest all collection: 9e187896-7809-46d0-9210-d75be1b3c4d4 collector: example started: done: errors: 0 +Sled serial0 + part number: model0 + power: A2 + revision: 0 + MGS slot: Sled 0 (cubby 0) + found at: 2025-07-18 15:33:16.074202 UTC from fake MGS 1 + host phase 1 hashes: + SLOT HASH + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a + RoT pages: + SLOT DATA_BASE64 + RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: slotAdigest1 + RoT: slot B SHA3-256: slotBdigest1 + +Sled serial1 + part number: model1 + power: A2 + revision: 0 + MGS slot: Sled 1 (cubby 1) + found at: 2025-07-18 15:33:16.074361 UTC from fake MGS 1 + host phase 1 hashes: + SLOT HASH + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a + RoT pages: + SLOT DATA_BASE64 + RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: slotAdigest1 + RoT: slot B SHA3-256: slotBdigest1 + +Sled serial2 + part number: model2 + power: A2 + revision: 0 + MGS slot: Sled 2 (cubby 2) + found at: 2025-07-18 15:33:16.074434 UTC from fake MGS 1 + host phase 1 hashes: + SLOT HASH + 
cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a + RoT pages: + SLOT DATA_BASE64 + RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: slotAdigest1 + RoT: slot B SHA3-256: slotBdigest1 + SLED AGENTS sled 2eb69596-f081-4e2d-9425-9994926e0832 (role = Gimlet, serial serial1) From 5792473e57b0c9bc1d4a7a3c41b6e1d36f544149 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 18 Jul 2025 11:36:40 -0400 Subject: [PATCH 10/13] add host phase 1 hashes to simulated system --- Cargo.lock | 1 + .../tests/output/cmds-example-stdout | 18 ++++++---- nexus/reconfigurator/planning/src/system.rs | 35 +++++++++++++++++++ nexus/reconfigurator/simulation/Cargo.toml | 1 + nexus/reconfigurator/simulation/src/system.rs | 16 ++++++++- nexus/types/src/inventory/display.rs | 8 ++++- 6 files changed, 71 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a2b3ecdc4c..28414e2f99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6799,6 +6799,7 @@ dependencies = [ "omicron-workspace-hack", "petname", "slog", + "strum 0.27.1", "swrite", "sync-ptr", "thiserror 2.0.12", diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 688ec00e69..3770bcf9d1 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1025,9 +1025,11 @@ Sled serial0 power: A2 revision: 0 MGS slot: Sled 0 (cubby 0) - found at: 2025-07-18 15:33:16.074202 UTC from fake MGS 1 + found at: from fake MGS 1 host phase 1 hashes: - SLOT HASH + SLOT HASH + A 0101010101010101010101010101010101010101010101010101010101010101 + B 0202020202020202020202020202020202020202020202020202020202020202 cabooses: SLOT BOARD NAME VERSION GIT_COMMIT SIGN SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a @@ -1045,9 +1047,11 @@ Sled serial1 power: A2 revision: 0 MGS slot: Sled 1 (cubby 1) - found at: 2025-07-18 15:33:16.074361 UTC from fake MGS 1 + found at: from fake MGS 1 host phase 1 hashes: - SLOT HASH + SLOT HASH + A 0101010101010101010101010101010101010101010101010101010101010101 + B 0202020202020202020202020202020202020202020202020202020202020202 cabooses: SLOT BOARD NAME VERSION GIT_COMMIT SIGN SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a @@ -1065,9 +1069,11 @@ Sled serial2 power: A2 revision: 0 MGS slot: Sled 2 (cubby 2) - found at: 2025-07-18 15:33:16.074434 UTC from fake MGS 1 + found at: from fake MGS 1 host phase 1 hashes: - SLOT HASH + SLOT HASH + A 0101010101010101010101010101010101010101010101010101010101010101 + B 0202020202020202020202020202020202020202020202020202020202020202 cabooses: SLOT BOARD NAME VERSION GIT_COMMIT SIGN SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index fef4bde6a2..0fb3843e29 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -57,6 +57,7 @@ use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; use omicron_common::disk::DiskVariant; +use omicron_common::disk::M2Slot; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; use omicron_common::policy::NEXUS_REDUNDANCY; use omicron_uuid_kinds::SledUuid; @@ -69,6 +70,7 @@ use 
std::net::Ipv4Addr; use std::net::Ipv6Addr; use std::sync::Arc; use std::time::Duration; +use tufaceous_artifact::ArtifactHash; use tufaceous_artifact::ArtifactVersion; /// Describes an actual or synthetic Oxide rack for planning and testing @@ -596,6 +598,18 @@ impl SystemDescription { part_number: sp_state.model.clone(), serial_number: sp_state.serial_number.clone(), }; + + for (m2_slot, hash) in s.sp_host_phase_1_hash_flash() { + builder + .found_host_phase_1_flash_hash( + &baseboard_id, + m2_slot, + "fake MGS 1", + hash, + ) + .context("recording SP host phase 1 flash hash")?; + } + if let Some(active) = &s.sp_active_caboose() { builder .found_caboose( @@ -805,6 +819,7 @@ pub struct SledHwInventory<'a> { pub baseboard_id: &'a BaseboardId, pub sp: &'a nexus_types::inventory::ServiceProcessor, pub rot: &'a nexus_types::inventory::RotState, + pub sp_host_phase_1_hash_flash: BTreeMap, pub sp_active: Option>, pub sp_inactive: Option>, } @@ -822,6 +837,7 @@ pub struct Sled { policy: SledPolicy, state: SledState, resources: SledResources, + sp_host_phase_1_hash_flash: BTreeMap, sp_active_caboose: Option>, sp_inactive_caboose: Option>, } @@ -972,6 +988,12 @@ impl Sled { policy, state: SledState::Active, resources: SledResources { subnet: sled_subnet, zpools }, + sp_host_phase_1_hash_flash: [ + (M2Slot::A, ArtifactHash([1; 32])), + (M2Slot::B, ArtifactHash([2; 32])), + ] + .into_iter() + .collect(), sp_active_caboose: Some(Arc::new(Self::default_sp_caboose( String::from("0.0.1"), ))), @@ -1006,6 +1028,10 @@ impl Sled { }) .unwrap_or(Baseboard::Unknown); + let sp_host_phase_1_hash_flash = inventory_sp + .as_ref() + .map(|hw| hw.sp_host_phase_1_hash_flash.clone()) + .unwrap_or_default(); let sp_active_caboose = inventory_sp.as_ref().and_then(|hw| hw.sp_active.clone()); let sp_inactive_caboose = @@ -1119,6 +1145,7 @@ impl Sled { policy: sled_policy, state: sled_state, resources: sled_resources, + sp_host_phase_1_hash_flash, sp_active_caboose, sp_inactive_caboose, } @@ -1147,6 +1174,14 @@ impl Sled { self.inventory_sp.as_ref() } + pub fn sp_host_phase_1_hash_flash( + &self, + ) -> impl Iterator + '_ { + self.sp_host_phase_1_hash_flash + .iter() + .map(|(&slot, &hash)| (slot, hash)) + } + fn sled_agent_inventory(&self) -> &Inventory { &self.inventory_sled_agent } diff --git a/nexus/reconfigurator/simulation/Cargo.toml b/nexus/reconfigurator/simulation/Cargo.toml index c2bb4945b7..2e3234c5c1 100644 --- a/nexus/reconfigurator/simulation/Cargo.toml +++ b/nexus/reconfigurator/simulation/Cargo.toml @@ -21,6 +21,7 @@ omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true petname = { workspace = true, default-features = false } slog.workspace = true +strum.workspace = true swrite.workspace = true sync-ptr.workspace = true thiserror.workspace = true diff --git a/nexus/reconfigurator/simulation/src/system.rs b/nexus/reconfigurator/simulation/src/system.rs index 958a77df35..4e9e7a48e8 100644 --- a/nexus/reconfigurator/simulation/src/system.rs +++ b/nexus/reconfigurator/simulation/src/system.rs @@ -19,8 +19,11 @@ use nexus_types::{ internal_api::params::{DnsConfigParams, DnsConfigZone}, inventory::{CabooseWhich, Collection}, }; -use omicron_common::{address::IpRange, api::external::Generation}; +use omicron_common::{ + address::IpRange, api::external::Generation, disk::M2Slot, +}; use omicron_uuid_kinds::{BlueprintUuid, CollectionUuid, SledUuid}; +use strum::IntoEnumIterator as _; use crate::{ LoadSerializedResultBuilder, @@ -769,6 +772,16 @@ impl SimSystemBuilderInner { 
.and_then(|baseboard_id| { let inv_sp = primary_collection.sps.get(baseboard_id); let inv_rot = primary_collection.rots.get(baseboard_id); + let sp_host_phase_1_hash_flash = M2Slot::iter() + .filter_map(|slot| { + let found = primary_collection + .host_phase_1_flash_hash_for( + slot, + baseboard_id, + )?; + Some((slot, found.hash)) + }) + .collect(); let sp_active = primary_collection .caboose_for(CabooseWhich::SpSlot0, baseboard_id) .map(|c| c.caboose.clone()); @@ -780,6 +793,7 @@ impl SimSystemBuilderInner { baseboard_id: &baseboard_id, sp: inv_sp, rot: inv_rot, + sp_host_phase_1_hash_flash, sp_active, sp_inactive, }) diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 464da9e8f4..7ee98c7db9 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -342,7 +342,13 @@ fn display_devices( write!(f, " (cubby {})", sp.sp_slot)?; } writeln!(f, "")?; - writeln!(f, " found at: {} from {}", sp.time_collected, sp.source)?; + writeln!( + f, + " found at: {} from {}", + sp.time_collected + .to_rfc3339_opts(SecondsFormat::Millis, /* use_z */ true), + sp.source + )?; #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] From e2f233c93ad35f9403baf96853aef8e74afa5e65 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 18 Jul 2025 11:45:36 -0400 Subject: [PATCH 11/13] more comments on MGS polling implementation --- clients/gateway-client/src/lib.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index 2599c1dcfd..4de1d84a22 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -130,6 +130,29 @@ impl Client { phase1_slot: u16, timeout: Duration, ) -> Result<[u8; 32], HostPhase1HashError> { + // The most common cases of calling this function are: + // + // 1. The hash is already calculated; we get it in the first `get` + // operation below and return after a single request to MGS. + // 2. The hash needs to be recalculated; we'll issue a "start hashing" + // request then go into the polling loop. We expect to sit in that + // loop for a handful of seconds. + // + // Given these, we could make this poll duration longer, since we know + // the operation takes a little while. But there are two arguments for + // polling somewhat more frequently: + // + // 1. Timeouts, timeouts, always wrong; if we believe hashing takes (by + // way of example) 7 seconds, so we set the timeout to something + // slightly larger than that (say 10 seconds), if a real device takes + // slightly longer than our timeout, we now wait 20 seconds. + // 2. An uncommon case of calling this function is that our initial + // `get` returns `HashInProgress`; in this case we have no idea how + // long the hashing has already been running, so would not know how + // long to try to wait. + // + // It should be pretty cheap to poll the SP at 1 Hz, so we sidestep both + // of those issues by doing so. const SLEEP_BETWEEN_POLLS: Duration = Duration::from_secs(1); const PHASE1_FLASH: &str = SpComponent::HOST_CPU_BOOT_FLASH.const_as_str(); @@ -154,6 +177,13 @@ impl Client { }; if need_to_start_hashing { + // It's possible multiple Nexus instances race, all see + // `HashNotCalculated` above, then all try to start hashing here. 
+ // The SP will accept the first request and return a + // `HashInProgress` error for subsequent attempts, but MGS does its + // best to make this operation idempotent; in particular, it will + // catch a `HashInProgress` error here and return an HTTP success. + // We'll return any other error. self.sp_component_hash_firmware_start( sp.type_, sp.slot, From bdbd14d22dff9e5f9e611ec3d3dc2e0e3cf6a669 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 18 Jul 2025 14:23:42 -0400 Subject: [PATCH 12/13] HwHostPhase1Slot -> HwM2Slot --- nexus/db-model/src/inventory.rs | 16 ++++++++-------- nexus/db-queries/src/db/datastore/inventory.rs | 8 ++++---- nexus/db-schema/src/enums.rs | 2 +- nexus/db-schema/src/schema.rs | 2 +- .../crdb/add-inv-host-phase-1-flash-hash/up1.sql | 2 +- .../crdb/add-inv-host-phase-1-flash-hash/up2.sql | 2 +- schema/crdb/dbinit.sql | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 7d11f52838..d1ac16d937 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -160,26 +160,26 @@ impl From for RotSlot { // See [`M2Slot`]. impl_enum_type!( - HwHostPhase1SlotEnum: + HwM2SlotEnum: #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] - pub enum HwHostPhase1Slot; + pub enum HwM2Slot; // Enum values A => b"A" B => b"B" ); -impl From for M2Slot { - fn from(value: HwHostPhase1Slot) -> Self { +impl From for M2Slot { + fn from(value: HwM2Slot) -> Self { match value { - HwHostPhase1Slot::A => Self::A, - HwHostPhase1Slot::B => Self::B, + HwM2Slot::A => Self::A, + HwM2Slot::B => Self::B, } } } -impl From for HwHostPhase1Slot { +impl From for HwM2Slot { fn from(value: M2Slot) -> Self { match value { M2Slot::A => Self::A, @@ -791,7 +791,7 @@ pub struct InvHostPhase1FlashHash { pub time_collected: DateTime, pub source: String, - pub slot: HwHostPhase1Slot, + pub slot: HwM2Slot, pub hash: ArtifactHash, } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index c9736727ea..0e2decb930 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -31,7 +31,7 @@ use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_errors::public_error_from_diesel_lookup; use nexus_db_model::ArtifactHash; -use nexus_db_model::HwHostPhase1Slot; +use nexus_db_model::HwM2Slot; use nexus_db_model::InvCaboose; use nexus_db_model::InvClickhouseKeeperMembership; use nexus_db_model::InvCockroachStatus; @@ -72,7 +72,7 @@ use nexus_db_model::{ }; use nexus_db_model::{HwPowerState, InvZoneManifestNonBoot}; use nexus_db_model::{HwRotSlot, InvMupdateOverrideNonBoot}; -use nexus_db_schema::enums::HwHostPhase1SlotEnum; +use nexus_db_schema::enums::HwM2SlotEnum; use nexus_db_schema::enums::HwRotSlotEnum; use nexus_db_schema::enums::RotImageErrorEnum; use nexus_db_schema::enums::RotPageWhichEnum; @@ -699,8 +699,8 @@ impl DataStore { phase1.source .clone() .into_sql::(), - HwHostPhase1Slot::from(phase1.slot) - .into_sql::(), + HwM2Slot::from(phase1.slot) + .into_sql::(), ArtifactHash(phase1.hash) .into_sql::(), )) diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index dfe7f0210e..00acc0fe0b 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -39,7 +39,7 @@ define_enums! 
{ DownstairsClientStopRequestReasonEnum => "downstairs_client_stop_request_reason_type", DownstairsClientStoppedReasonEnum => "downstairs_client_stopped_reason_type", FailureDomainEnum => "failure_domain", - HwHostPhase1SlotEnum => "hw_host_phase_1_slot", + HwM2SlotEnum => "hw_m2_slot", HwPowerStateEnum => "hw_power_state", HwRotSlotEnum => "hw_rot_slot", IdentityProviderTypeEnum => "provider_type", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 56759803de..25f068d036 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1564,7 +1564,7 @@ table! { time_collected -> Timestamptz, source -> Text, - slot -> crate::enums::HwHostPhase1SlotEnum, + slot -> crate::enums::HwM2SlotEnum, hash -> Text, } } diff --git a/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql b/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql index 903a68c72f..babdce1dd9 100644 --- a/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql +++ b/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql @@ -1,4 +1,4 @@ -CREATE TYPE IF NOT EXISTS omicron.public.hw_host_phase_1_slot AS ENUM ( +CREATE TYPE IF NOT EXISTS omicron.public.hw_m2_slot AS ENUM ( 'A', 'B' ); diff --git a/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql b/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql index e486e656e1..77f1ed1aff 100644 --- a/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql +++ b/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql @@ -3,7 +3,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_host_phase_1_flash_hash ( hw_baseboard_id UUID NOT NULL, time_collected TIMESTAMPTZ NOT NULL, source TEXT NOT NULL, - slot omicron.public.hw_host_phase_1_slot NOT NULL, + slot omicron.public.hw_m2_slot NOT NULL, hash STRING(64) NOT NULL, PRIMARY KEY (inv_collection_id, hw_baseboard_id, slot) ); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 3ebb6c75a7..0be6c90ee0 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3506,7 +3506,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( ); -- host phase 1 slots -CREATE TYPE IF NOT EXISTS omicron.public.hw_host_phase_1_slot AS ENUM ( +CREATE TYPE IF NOT EXISTS omicron.public.hw_m2_slot AS ENUM ( 'A', 'B' ); @@ -3527,7 +3527,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_host_phase_1_flash_hash ( source TEXT NOT NULL, -- phase 1 slot for this hash - slot omicron.public.hw_host_phase_1_slot NOT NULL, + slot omicron.public.hw_m2_slot NOT NULL, -- the actual hash of the contents hash STRING(64) NOT NULL, From 515e263f8732626dd37a91d36c2fc3f4163c4c70 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 18 Jul 2025 14:28:12 -0400 Subject: [PATCH 13/13] replace TODO-john with issue link --- nexus/db-model/src/inventory.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index d1ac16d937..b585052750 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1009,7 +1009,8 @@ impl InvSledConfigReconciler { boot_partition_a_error: Option, boot_partition_b_error: Option, ) -> Self { - // TODO-john replace this column with the hw_host_phase_1_slot enum? + // TODO-cleanup We should use `HwM2Slot` instead of integers for this + // column: https://github.com/oxidecomputer/omicron/issues/8642 let (boot_disk_slot, boot_disk_error) = match boot_disk { Ok(M2Slot::A) => (Some(SqlU8(0)), None), Ok(M2Slot::B) => (Some(SqlU8(1)), None),
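The cleanup flagged in the final hunk (omicron issue 8642) concerns representation only; the data this series actually lands lives in the new inv_host_phase_1_flash_hash table. As a quick post-migration sanity check, that table can be queried directly. The query below is only illustrative, written against the schema exactly as defined in up2.sql above (with the patch 12 rename to hw_m2_slot applied); the collection UUID is a placeholder, not a real value:

-- List the host phase 1 flash hashes recorded for a single inventory
-- collection; replace the UUID literal with a real inv_collection_id.
SELECT
    hw_baseboard_id,
    slot,
    hash
FROM omicron.public.inv_host_phase_1_flash_hash
WHERE inv_collection_id = '00000000-0000-0000-0000-000000000000'
ORDER BY hw_baseboard_id, slot;

Because the primary key is (inv_collection_id, hw_baseboard_id, slot) and slot is the two-variant hw_m2_slot enum, this returns at most two rows (A and B) per baseboard for the chosen collection.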