diff --git a/Cargo.lock b/Cargo.lock index 1b63efeb98e..b9fb85ce24f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3618,6 +3618,7 @@ dependencies = [ "serde_json", "slog", "thiserror 2.0.12", + "tokio", "uuid", ] @@ -6799,6 +6800,7 @@ dependencies = [ "omicron-workspace-hack", "petname", "slog", + "strum 0.27.1", "swrite", "sync-ptr", "thiserror 2.0.12", @@ -6865,6 +6867,7 @@ version = "0.1.0" dependencies = [ "async-trait", "nexus-config", + "nexus-db-queries", "nexus-sled-agent-shared", "nexus-types", "omicron-common", @@ -6922,6 +6925,7 @@ dependencies = [ "serde_urlencoded", "sled-agent-client", "slog", + "slog-error-chain", "tokio", "tokio-postgres", "tokio-util", diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml index 7633fa95e38..8617fda5faf 100644 --- a/clients/gateway-client/Cargo.toml +++ b/clients/gateway-client/Cargo.toml @@ -23,5 +23,6 @@ serde_json.workspace = true schemars.workspace = true slog.workspace = true thiserror.workspace = true +tokio.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index 8219f7d34b2..4de1d84a22b 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -7,6 +7,9 @@ //! Interface for API requests to a Management Gateway Service (MGS) instance pub use gateway_messages::SpComponent; +use std::time::Duration; +use std::time::Instant; +use types::ComponentFirmwareHashStatus; // We specifically want to allow consumers, such as `wicketd`, to embed // inventory datatypes into their own APIs, rather than recreate structs. 
@@ -97,3 +100,131 @@ impl PartialOrd for crate::types::SpIdentifier { Some(self.cmp(other)) } } + +#[derive(Debug, thiserror::Error)] +pub enum HostPhase1HashError { + #[error("timed out waiting for hash calculation")] + Timeout, + #[error("hash calculation failed (phase1 written while hashing?)")] + ContentsModifiedWhileHashing, + #[error("failed to send request to {kind}")] + RequestError { + kind: &'static str, + #[source] + err: Error, + }, +} + +impl Client { + /// Get the hash of the host phase 1 flash contents in the given slot. + /// + /// This operation is implemented asynchronously on the SP: a client (us) + /// must request the hash be calculated, then poll until the calculation is + /// complete. This method takes care of the "start / poll" operation; the + /// caller must provide a timeout for how long they're willing to wait for + /// the calculation to complete. In practice, we expect this to take a + /// handful of seconds on real hardware. + pub async fn host_phase_1_flash_hash_calculate_with_timeout( + &self, + sp: types::SpIdentifier, + phase1_slot: u16, + timeout: Duration, + ) -> Result<[u8; 32], HostPhase1HashError> { + // The most common cases of calling this function are: + // + // 1. The hash is already calculated; we get it in the first `get` + // operation below and return after a single request to MGS. + // 2. The hash needs to be recalculated; we'll issue a "start hashing" + // request then go into the polling loop. We expect to sit in that + // loop for a handful of seconds. + // + // Given these, we could make this poll duration longer, since we know + // the operation takes a little while. But there are two arguments for + // polling somewhat more frequently: + // + // 1. 
Timeouts, timeouts, always wrong; if we believe hashing takes (by + // way of example) 7 seconds, so we set the timeout to something + // slightly larger than that (say 10 seconds), if a real device takes + // slightly longer than our timeout, we now wait 20 seconds. + // 2. An uncommon case of calling this function is that our initial + // `get` returns `HashInProgress`; in this case we have no idea how + // long the hashing has already been running, so would not know how + // long to try to wait. + // + // It should be pretty cheap to poll the SP at 1 Hz, so we sidestep both + // of those issues by doing so. + const SLEEP_BETWEEN_POLLS: Duration = Duration::from_secs(1); + const PHASE1_FLASH: &str = + SpComponent::HOST_CPU_BOOT_FLASH.const_as_str(); + + let need_to_start_hashing = match self + .sp_component_hash_firmware_get( + sp.type_, + sp.slot, + PHASE1_FLASH, + phase1_slot, + ) + .await + .map_err(|err| HostPhase1HashError::RequestError { + kind: "get hash", + err, + })? + .into_inner() + { + ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash), + ComponentFirmwareHashStatus::HashInProgress => false, + ComponentFirmwareHashStatus::HashNotCalculated => true, + }; + + if need_to_start_hashing { + // It's possible multiple Nexus instances race, all see + // `HashNotCalculated` above, then all try to start hashing here. + // The SP will accept the first request and return a + // `HashInProgress` error for subsequent attempts, but MGS does its + // best to make this operation idempotent; in particular, it will + // catch a `HashInProgress` error here and return an HTTP success. + // We'll return any other error. 
+ self.sp_component_hash_firmware_start( + sp.type_, + sp.slot, + PHASE1_FLASH, + phase1_slot, + ) + .await + .map_err(|err| HostPhase1HashError::RequestError { + kind: "start hashing", + err, + })?; + } + + let start = Instant::now(); + loop { + tokio::time::sleep(SLEEP_BETWEEN_POLLS).await; + if start.elapsed() > timeout { + return Err(HostPhase1HashError::Timeout); + } + match self + .sp_component_hash_firmware_get( + sp.type_, + sp.slot, + PHASE1_FLASH, + phase1_slot, + ) + .await + .map_err(|err| HostPhase1HashError::RequestError { + kind: "get hash", + err, + })? + .into_inner() + { + ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash), + ComponentFirmwareHashStatus::HashInProgress => continue, + ComponentFirmwareHashStatus::HashNotCalculated => { + return Err( + HostPhase1HashError::ContentsModifiedWhileHashing, + ); + } + } + } + } +} diff --git a/common/src/disk.rs b/common/src/disk.rs index e7ceb41832a..12c29876de8 100644 --- a/common/src/disk.rs +++ b/common/src/disk.rs @@ -619,6 +619,7 @@ impl DiskManagementError { Deserialize, Serialize, JsonSchema, + strum::EnumIter, )] pub enum M2Slot { A, diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 3fa997275c0..3acbd9062ea 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -16,6 +16,7 @@ use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::Blueprint; use nexus_types::deployment::SledFilter; use nexus_types::deployment::UnstableReconfiguratorState; +use omicron_common::api::external::SwitchLocation; use omicron_test_utils::dev::test_cmds::Redactor; use omicron_test_utils::dev::test_cmds::path_to_executable; use omicron_test_utils::dev::test_cmds::run_command; @@ -23,6 +24,7 @@ use slog_error_chain::InlineErrorChain; use std::fmt::Write; use std::net::IpAddr; use std::path::Path; +use std::time::Duration; use subprocess::Exec; use uuid::Uuid; @@ -131,17 +133,20 @@ async fn 
test_omdb_usage_errors() { async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { clear_omdb_env(); - let gwtestctx = gateway_test_utils::setup::test_setup( - "test_omdb_success_case", - gateway_messages::SpPort::One, - ) - .await; let cmd_path = path_to_executable(CMD_OMDB); let postgres_url = cptestctx.database.listen_url(); let nexus_internal_url = format!("http://{}/", cptestctx.internal_client.bind_address); - let mgs_url = format!("http://{}/", gwtestctx.client.bind_address); + let mgs_url = format!( + "http://{}/", + cptestctx + .gateway + .get(&SwitchLocation::Switch0) + .expect("nexus_test always sets up MGS on switch 0") + .client + .bind_address + ); let ox_url = format!("http://{}/", cptestctx.oximeter.server_address()); let ox_test_producer = cptestctx.producer.address().ip(); let ch_url = format!("http://{}/", cptestctx.clickhouse.http_address()); @@ -165,6 +170,13 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { ) .await; + // Wait for Nexus to have gathered at least one inventory collection. (We'll + // check below that `reconfigurator export` contains at least one, so have + // to wait until there's one to export.) 
+ cptestctx + .wait_for_at_least_one_inventory_collection(Duration::from_secs(60)) + .await; + let mut output = String::new(); let invocations: &[&[&str]] = &[ @@ -319,8 +331,6 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &ox_url, ox_test_producer, ); - - gwtestctx.teardown().await; } /// Verify that we properly deal with cases where: diff --git a/dev-tools/reconfigurator-cli/tests/input/cmds-example.txt b/dev-tools/reconfigurator-cli/tests/input/cmds-example.txt index a3cd8aa94b6..21f8a27c0f2 100644 --- a/dev-tools/reconfigurator-cli/tests/input/cmds-example.txt +++ b/dev-tools/reconfigurator-cli/tests/input/cmds-example.txt @@ -52,7 +52,7 @@ load-example --seed test-basic --nsleds 3 --sled-policy 1:non-provisionable --sl blueprint-list blueprint-show latest -inventory-show latest +inventory-show latest all # Plan a blueprint run -- this will cause zones and disks on the expunged # sled to be expunged. diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index ef72633eb3a..5fc208d0efd 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1013,13 +1013,79 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9 PENDING MGS-MANAGED UPDATES: 0 -> inventory-show latest +> inventory-show latest all collection: 9e187896-7809-46d0-9210-d75be1b3c4d4 collector: example started: done: errors: 0 +Sled serial0 + part number: model0 + power: A2 + revision: 0 + MGS slot: Sled 0 (cubby 0) + found at: from fake MGS 1 + host phase 1 hashes: + SLOT HASH + A 0101010101010101010101010101010101010101010101010101010101010101 + B 0202020202020202020202020202020202020202020202020202020202020202 + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a + RoT pages: + SLOT DATA_BASE64 + RoT: active slot: slot A + RoT: persistent boot preference: 
slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: slotAdigest1 + RoT: slot B SHA3-256: slotBdigest1 + +Sled serial1 + part number: model1 + power: A2 + revision: 0 + MGS slot: Sled 1 (cubby 1) + found at: from fake MGS 1 + host phase 1 hashes: + SLOT HASH + A 0101010101010101010101010101010101010101010101010101010101010101 + B 0202020202020202020202020202020202020202020202020202020202020202 + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a + RoT pages: + SLOT DATA_BASE64 + RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: slotAdigest1 + RoT: slot B SHA3-256: slotBdigest1 + +Sled serial2 + part number: model2 + power: A2 + revision: 0 + MGS slot: Sled 2 (cubby 2) + found at: from fake MGS 1 + host phase 1 hashes: + SLOT HASH + A 0101010101010101010101010101010101010101010101010101010101010101 + B 0202020202020202020202020202020202020202020202020202020202020202 + cabooses: + SLOT BOARD NAME VERSION GIT_COMMIT SIGN + SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a + RoT pages: + SLOT DATA_BASE64 + RoT: active slot: slot A + RoT: persistent boot preference: slot A + RoT: pending persistent boot preference: - + RoT: transient boot preference: - + RoT: slot A SHA3-256: slotAdigest1 + RoT: slot B SHA3-256: slotBdigest1 + SLED AGENTS sled 2eb69596-f081-4e2d-9425-9994926e0832 (role = Gimlet, serial serial1) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 7c1ee230de4..b585052750a 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -30,7 +30,7 @@ use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, - 
inv_last_reconciliation_dataset_result, + inv_host_phase_1_flash_hash, inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_zone_result, inv_mupdate_override_non_boot, @@ -158,6 +158,36 @@ impl From for RotSlot { } } +// See [`M2Slot`]. +impl_enum_type!( + HwM2SlotEnum: + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + pub enum HwM2Slot; + + // Enum values + A => b"A" + B => b"B" +); + +impl From for M2Slot { + fn from(value: HwM2Slot) -> Self { + match value { + HwM2Slot::A => Self::A, + HwM2Slot::B => Self::B, + } + } +} + +impl From for HwM2Slot { + fn from(value: M2Slot) -> Self { + match value { + M2Slot::A => Self::A, + M2Slot::B => Self::B, + } + } +} + // See [`nexus_types::inventory::CabooseWhich`]. impl_enum_type!( CabooseWhichEnum: @@ -752,6 +782,19 @@ impl From for nexus_types::inventory::RotState { } } +/// See [`nexus_types::inventory::HostPhase1FlashHash`]. +#[derive(Queryable, Clone, Debug, Selectable)] +#[diesel(table_name = inv_host_phase_1_flash_hash)] +pub struct InvHostPhase1FlashHash { + pub inv_collection_id: Uuid, + pub hw_baseboard_id: Uuid, + pub time_collected: DateTime, + pub source: String, + + pub slot: HwM2Slot, + pub hash: ArtifactHash, +} + /// See [`nexus_types::inventory::CabooseFound`]. 
#[derive(Queryable, Clone, Debug, Selectable)] #[diesel(table_name = inv_caboose)] @@ -966,6 +1009,8 @@ impl InvSledConfigReconciler { boot_partition_a_error: Option, boot_partition_b_error: Option, ) -> Self { + // TODO-cleanup We should use `HwM2Slot` instead of integers for this + // column: https://github.com/oxidecomputer/omicron/issues/8642 let (boot_disk_slot, boot_disk_error) = match boot_disk { Ok(M2Slot::A) => (Some(SqlU8(0)), None), Ok(M2Slot::B) => (Some(SqlU8(1)), None), diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 9918f691897..d29ff7c27ff 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(167, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(168, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(168, "add-inv-host-phase-1-flash-hash"), KnownVersion::new(167, "add-pending-mgs-updates-rot"), KnownVersion::new(166, "bundle-user-comment"), KnownVersion::new(165, "route-config-rib-priority"), diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 307ada9b488..0e2decb9306 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -30,6 +30,8 @@ use iddqd::IdOrdMap; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_errors::public_error_from_diesel_lookup; +use nexus_db_model::ArtifactHash; +use nexus_db_model::HwM2Slot; use nexus_db_model::InvCaboose; use nexus_db_model::InvClickhouseKeeperMembership; use nexus_db_model::InvCockroachStatus; @@ -38,6 +40,7 @@ use nexus_db_model::InvCollectionError; use nexus_db_model::InvConfigReconcilerStatus; use nexus_db_model::InvConfigReconcilerStatusKind; use nexus_db_model::InvDataset; +use nexus_db_model::InvHostPhase1FlashHash; use nexus_db_model::InvLastReconciliationDatasetResult; use nexus_db_model::InvLastReconciliationDiskResult; use nexus_db_model::InvLastReconciliationOrphanedDataset; @@ -69,6 +72,7 @@ use nexus_db_model::{ }; use nexus_db_model::{HwPowerState, InvZoneManifestNonBoot}; use nexus_db_model::{HwRotSlot, InvMupdateOverrideNonBoot}; +use nexus_db_schema::enums::HwM2SlotEnum; use nexus_db_schema::enums::HwRotSlotEnum; use nexus_db_schema::enums::RotImageErrorEnum; use nexus_db_schema::enums::RotPageWhichEnum; @@ -668,6 +672,76 @@ impl DataStore { } } + // Insert rows for the host phase 1 flash hashes that we found. + // Like service processors, we do this using INSERT INTO ... SELECT. 
+ { + use nexus_db_schema::schema::hw_baseboard_id::dsl as baseboard_dsl; + use nexus_db_schema::schema::inv_host_phase_1_flash_hash::dsl as phase1_dsl; + + // Squish our map-of-maps down to a flat iterator. + // + // We can throw away the `_slot` key because the `phase1` + // structures also contain their own slot. (Maybe we could use + // `iddqd` here instead?) + let phase1_hashes = collection + .host_phase_1_flash_hashes + .iter() + .flat_map(|(_slot, by_baseboard)| by_baseboard.iter()); + + for (baseboard_id, phase1) in phase1_hashes { + let selection = nexus_db_schema::schema::hw_baseboard_id::table + .select(( + db_collection_id + .into_sql::(), + baseboard_dsl::id, + phase1.time_collected + .into_sql::(), + phase1.source + .clone() + .into_sql::(), + HwM2Slot::from(phase1.slot) + .into_sql::(), + ArtifactHash(phase1.hash) + .into_sql::(), + )) + .filter( + baseboard_dsl::part_number + .eq(baseboard_id.part_number.clone()), + ) + .filter( + baseboard_dsl::serial_number + .eq(baseboard_id.serial_number.clone()), + ); + + let _ = diesel::insert_into( + nexus_db_schema::schema::inv_host_phase_1_flash_hash::table, + ) + .values(selection) + .into_columns(( + phase1_dsl::inv_collection_id, + phase1_dsl::hw_baseboard_id, + phase1_dsl::time_collected, + phase1_dsl::source, + phase1_dsl::slot, + phase1_dsl::hash, + )) + .execute_async(&conn) + .await?; + + // See the comment in the above block (where we use + // `inv_service_processor::all_columns()`). The same + // applies here. + let ( + _inv_collection_id, + _hw_baseboard_id, + _time_collected, + _source, + _slot, + _hash, + ) = phase1_dsl::inv_host_phase_1_flash_hash::all_columns(); + } + } + // Insert rows for the cabooses that we found. Like service // processors and roots of trust, we do this using INSERT INTO ... // SELECT. 
This one's a little more complicated because there are @@ -1689,6 +1763,7 @@ impl DataStore { struct NumRowsDeleted { ncollections: usize, nsps: usize, + nhost_phase1_flash_hashes: usize, nrots: usize, ncabooses: usize, nrot_pages: usize, @@ -1719,6 +1794,7 @@ impl DataStore { let NumRowsDeleted { ncollections, nsps, + nhost_phase1_flash_hashes, nrots, ncabooses, nrot_pages, @@ -1768,6 +1844,16 @@ impl DataStore { .await? }; + // Remove rows for host phase 1 flash hashes. + let nhost_phase1_flash_hashes = { + use nexus_db_schema::schema::inv_host_phase_1_flash_hash::dsl; + diesel::delete(dsl::inv_host_phase_1_flash_hash.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + // Remove rows for roots of trust. let nrots = { use nexus_db_schema::schema::inv_root_of_trust::dsl; @@ -2004,6 +2090,7 @@ impl DataStore { Ok(NumRowsDeleted { ncollections, nsps, + nhost_phase1_flash_hashes, nrots, ncabooses, nrot_pages, @@ -2040,6 +2127,7 @@ impl DataStore { "collection_id" => collection_id.to_string(), "ncollections" => ncollections, "nsps" => nsps, + "nhost_phase1_flash_hashes" => nhost_phase1_flash_hashes, "nrots" => nrots, "ncabooses" => ncabooses, "nrot_pages" => nrot_pages, @@ -2544,6 +2632,70 @@ impl DataStore { }) .collect::, _>>()?; + // Fetch records of host phase 1 flash hashes found. 
+ let inv_host_phase_1_flash_hash_rows = { + use nexus_db_schema::schema::inv_host_phase_1_flash_hash::dsl; + + let mut phase_1s = Vec::new(); + + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let mut batch = paginated_multicolumn( + dsl::inv_host_phase_1_flash_hash, + (dsl::hw_baseboard_id, dsl::slot), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvHostPhase1FlashHash::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = p.found_batch(&batch, &|row| { + (row.hw_baseboard_id, row.slot) + }); + phase_1s.append(&mut batch); + } + + phase_1s + }; + // Assemble the lists of host phase 1 flash hashes found. + let mut host_phase_1_flash_hashes = BTreeMap::new(); + for p in inv_host_phase_1_flash_hash_rows { + let slot = M2Slot::from(p.slot); + let by_baseboard = host_phase_1_flash_hashes + .entry(slot) + .or_insert_with(BTreeMap::new); + let Some(bb) = baseboards_by_id.get(&p.hw_baseboard_id) else { + let msg = format!( + "unknown baseboard found in \ + inv_host_phase_1_flash_hash: {}", + p.hw_baseboard_id + ); + return Err(Error::internal_error(&msg)); + }; + + let previous = by_baseboard.insert( + bb.clone(), + nexus_types::inventory::HostPhase1FlashHash { + time_collected: p.time_collected, + source: p.source, + slot, + hash: *p.hash, + }, + ); + bail_unless!( + previous.is_none(), + "duplicate host phase 1 flash hash found: {:?} baseboard {:?}", + p.slot, + p.hw_baseboard_id + ); + } + // Fetch records of cabooses found. 
let inv_caboose_rows = { use nexus_db_schema::schema::inv_caboose::dsl; @@ -3675,6 +3827,7 @@ impl DataStore { cabooses: cabooses_by_id.values().cloned().collect(), rot_pages: rot_pages_by_id.values().cloned().collect(), sps, + host_phase_1_flash_hashes, rots, cabooses_found, rot_pages_found, diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 2ee2f3ff6a0..00acc0fe0b8 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -39,6 +39,7 @@ define_enums! { DownstairsClientStopRequestReasonEnum => "downstairs_client_stop_request_reason_type", DownstairsClientStoppedReasonEnum => "downstairs_client_stopped_reason_type", FailureDomainEnum => "failure_domain", + HwM2SlotEnum => "hw_m2_slot", HwPowerStateEnum => "hw_power_state", HwRotSlotEnum => "hw_rot_slot", IdentityProviderTypeEnum => "provider_type", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index d256b19fd39..15fe3817bef 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1558,6 +1558,18 @@ table! { } } +table! { + inv_host_phase_1_flash_hash (inv_collection_id, hw_baseboard_id, slot) { + inv_collection_id -> Uuid, + hw_baseboard_id -> Uuid, + time_collected -> Timestamptz, + source -> Text, + + slot -> crate::enums::HwM2SlotEnum, + hash -> Text, + } +} + table! 
{ inv_caboose (inv_collection_id, hw_baseboard_id, which) { inv_collection_id -> Uuid, diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index d9fe84ca7a4..84af3322bf6 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -25,6 +25,7 @@ use nexus_types::inventory::CabooseFound; use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::CockroachStatus; use nexus_types::inventory::Collection; +use nexus_types::inventory::HostPhase1FlashHash; use nexus_types::inventory::RotPage; use nexus_types::inventory::RotPageFound; use nexus_types::inventory::RotPageWhich; @@ -35,12 +36,14 @@ use nexus_types::inventory::Zpool; use omicron_cockroach_metrics::CockroachMetric; use omicron_cockroach_metrics::NodeId; use omicron_cockroach_metrics::PrometheusMetrics; +use omicron_common::disk::M2Slot; use omicron_uuid_kinds::CollectionKind; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::hash::Hash; use std::sync::Arc; use thiserror::Error; +use tufaceous_artifact::ArtifactHash; use typed_rng::TypedUuidRng; /// Describes an operational error encountered during the collection process @@ -111,6 +114,8 @@ pub struct CollectionBuilder { cabooses: BTreeSet>, rot_pages: BTreeSet>, sps: BTreeMap, ServiceProcessor>, + host_phase_1_flash_hashes: + BTreeMap, HostPhase1FlashHash>>, rots: BTreeMap, RotState>, cabooses_found: BTreeMap, CabooseFound>>, @@ -144,6 +149,7 @@ impl CollectionBuilder { cabooses: BTreeSet::new(), rot_pages: BTreeSet::new(), sps: BTreeMap::new(), + host_phase_1_flash_hashes: BTreeMap::new(), rots: BTreeMap::new(), cabooses_found: BTreeMap::new(), rot_pages_found: BTreeMap::new(), @@ -166,6 +172,7 @@ impl CollectionBuilder { cabooses: self.cabooses, rot_pages: self.rot_pages, sps: self.sps, + host_phase_1_flash_hashes: self.host_phase_1_flash_hashes, rots: self.rots, cabooses_found: self.cabooses_found, rot_pages_found: self.rot_pages_found, @@ -303,6 +310,73 @@ impl 
CollectionBuilder { Some(baseboard) } + /// Returns true if we already found the host phase 1 flash hash for `slot` + /// for baseboard `baseboard` + /// + /// This is used to avoid requesting it multiple times (from multiple MGS + /// instances). + pub fn found_host_phase_1_flash_hash_already( + &self, + baseboard: &BaseboardId, + slot: M2Slot, + ) -> bool { + self.host_phase_1_flash_hashes + .get(&slot) + .map(|map| map.contains_key(baseboard)) + .unwrap_or(false) + } + + /// Record the given host phase 1 flash hash found for the given baseboard + /// + /// The baseboard must previously have been reported using + /// `found_sp_state()`. + /// + /// `source` is an arbitrary string for debugging that describes the MGS + /// that reported this data (generally a URL string). + pub fn found_host_phase_1_flash_hash( + &mut self, + baseboard: &BaseboardId, + slot: M2Slot, + source: &str, + hash: ArtifactHash, + ) -> Result<(), CollectorBug> { + let (baseboard, _) = + self.sps.get_key_value(baseboard).ok_or_else(|| { + anyhow!( + "reporting host phase 1 flash hash for unknown baseboard: \ + {baseboard:?} ({slot:?}: {hash})", + ) + })?; + let by_id = self + .host_phase_1_flash_hashes + .entry(slot) + .or_insert_with(BTreeMap::new); + if let Some(previous) = by_id.insert( + baseboard.clone(), + HostPhase1FlashHash { + time_collected: now_db_precision(), + source: source.to_owned(), + slot, + hash, + }, + ) { + let error = if previous.hash == hash { + anyhow!("reported multiple times (same value)") + } else { + anyhow!( + "reported host phase 1 flash hash \ + (previously {}, now {hash})", + previous.hash, + ) + }; + Err(CollectorBug::from( + error.context(format!("baseboard {baseboard:?} slot {slot:?}")), + )) + } else { + Ok(()) + } + } + /// Returns true if we already found the caboose for `which` for baseboard /// `baseboard` /// diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index eac99c6d042..e81fe85a3d8 100644 --- 
a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -10,17 +10,20 @@ use crate::builder::InventoryError; use anyhow::Context; use gateway_client::types::GetCfpaParams; use gateway_client::types::RotCfpaSlot; +use gateway_client::types::SpType; use gateway_messages::SpComponent; use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::Collection; use nexus_types::inventory::RotPage; use nexus_types::inventory::RotPageWhich; use omicron_cockroach_metrics::CockroachClusterAdminClient; +use omicron_common::disk::M2Slot; use slog::Logger; use slog::o; use slog::{debug, error}; use std::time::Duration; use strum::IntoEnumIterator; +use tufaceous_artifact::ArtifactHash; /// connection and request timeout used for Sled Agent HTTP client const SLED_AGENT_TIMEOUT: Duration = Duration::from_secs(60); @@ -176,6 +179,66 @@ impl<'a> Collector<'a> { continue; }; + // For sled SPs, for each host phase 1 slot, attempt to collect its + // hash, if it hasn't been collected already. Generally, we'd only + // get here for the first MGS client. Assuming that one succeeds, + // the other(s) will skip this loop. 
+ if matches!(sp.type_, SpType::Sled) { + for slot in M2Slot::iter() { + const PHASE1_HASH_TIMEOUT: Duration = + Duration::from_secs(30); + + if in_progress.found_host_phase_1_flash_hash_already( + &baseboard_id, + slot, + ) { + continue; + } + + let phase1_slot = match slot { + M2Slot::A => 0, + M2Slot::B => 1, + }; + + let result = client + .host_phase_1_flash_hash_calculate_with_timeout( + sp, + phase1_slot, + PHASE1_HASH_TIMEOUT, + ) + .await + .with_context(|| { + format!( + "MGS {:?}: SP {sp:?}: phase 1 slot {slot:?}", + client.baseurl(), + ) + }); + let hash = match result { + Err(error) => { + in_progress + .found_error(InventoryError::from(error)); + continue; + } + Ok(hash) => hash, + }; + if let Err(error) = in_progress + .found_host_phase_1_flash_hash( + &baseboard_id, + slot, + client.baseurl(), + ArtifactHash(hash), + ) + { + error!( + log, + "error reporting host phase 1 flash hash: \ + {baseboard_id:?} {slot:?} {:?}: {error:#}", + client.baseurl(), + ); + } + } + } + // For each kind of caboose that we care about, if it hasn't been // fetched already, fetch it and record it. Generally, we'd only // get here for the first MGS client. Assuming that one succeeds, diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index a1c71f7fc44..26cfe12c035 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -216,6 +216,35 @@ pub fn representative() -> Representative { ) .unwrap(); + // Report some phase 1 hashes. + // + // We'll report hashes for both slots for sled 1, only a hash for slot B on + // sled 2, and no hashes for sled 3. 
+ builder + .found_host_phase_1_flash_hash( + &sled1_bb, + M2Slot::A, + "fake MGS 1", + ArtifactHash([1; 32]), + ) + .unwrap(); + builder + .found_host_phase_1_flash_hash( + &sled1_bb, + M2Slot::B, + "fake MGS 1", + ArtifactHash([2; 32]), + ) + .unwrap(); + builder + .found_host_phase_1_flash_hash( + &sled2_bb, + M2Slot::B, + "fake MGS 1", + ArtifactHash([3; 32]), + ) + .unwrap(); + // Report some cabooses. // We'll use the same cabooses for most of these components, although diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index dbae19b3127..57aca38eece 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -58,6 +58,7 @@ use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; use omicron_common::disk::DiskVariant; +use omicron_common::disk::M2Slot; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; use omicron_common::policy::NEXUS_REDUNDANCY; use omicron_uuid_kinds::MupdateOverrideUuid; @@ -72,6 +73,7 @@ use std::net::Ipv4Addr; use std::net::Ipv6Addr; use std::sync::Arc; use std::time::Duration; +use tufaceous_artifact::ArtifactHash; use tufaceous_artifact::ArtifactVersion; /// Describes an actual or synthetic Oxide rack for planning and testing @@ -629,6 +631,18 @@ impl SystemDescription { part_number: sp_state.model.clone(), serial_number: sp_state.serial_number.clone(), }; + + for (m2_slot, hash) in s.sp_host_phase_1_hash_flash() { + builder + .found_host_phase_1_flash_hash( + &baseboard_id, + m2_slot, + "fake MGS 1", + hash, + ) + .context("recording SP host phase 1 flash hash")?; + } + if let Some(active) = &s.sp_active_caboose() { builder .found_caboose( @@ -838,6 +852,7 @@ pub struct SledHwInventory<'a> { pub baseboard_id: &'a BaseboardId, pub sp: &'a nexus_types::inventory::ServiceProcessor, pub rot: &'a nexus_types::inventory::RotState, + pub 
sp_host_phase_1_hash_flash: BTreeMap, pub sp_active: Option>, pub sp_inactive: Option>, } @@ -855,6 +870,7 @@ pub struct Sled { policy: SledPolicy, state: SledState, resources: SledResources, + sp_host_phase_1_hash_flash: BTreeMap, sp_active_caboose: Option>, sp_inactive_caboose: Option>, } @@ -1005,6 +1021,12 @@ impl Sled { policy, state: SledState::Active, resources: SledResources { subnet: sled_subnet, zpools }, + sp_host_phase_1_hash_flash: [ + (M2Slot::A, ArtifactHash([1; 32])), + (M2Slot::B, ArtifactHash([2; 32])), + ] + .into_iter() + .collect(), sp_active_caboose: Some(Arc::new(Self::default_sp_caboose( String::from("0.0.1"), ))), @@ -1039,6 +1061,10 @@ impl Sled { }) .unwrap_or(Baseboard::Unknown); + let sp_host_phase_1_hash_flash = inventory_sp + .as_ref() + .map(|hw| hw.sp_host_phase_1_hash_flash.clone()) + .unwrap_or_default(); let sp_active_caboose = inventory_sp.as_ref().and_then(|hw| hw.sp_active.clone()); let sp_inactive_caboose = @@ -1152,6 +1178,7 @@ impl Sled { policy: sled_policy, state: sled_state, resources: sled_resources, + sp_host_phase_1_hash_flash, sp_active_caboose, sp_inactive_caboose, } @@ -1180,6 +1207,14 @@ impl Sled { self.inventory_sp.as_ref() } + pub fn sp_host_phase_1_hash_flash( + &self, + ) -> impl Iterator + '_ { + self.sp_host_phase_1_hash_flash + .iter() + .map(|(&slot, &hash)| (slot, hash)) + } + fn sled_agent_inventory(&self) -> &Inventory { &self.inventory_sled_agent } diff --git a/nexus/reconfigurator/simulation/Cargo.toml b/nexus/reconfigurator/simulation/Cargo.toml index c2bb4945b70..2e3234c5c13 100644 --- a/nexus/reconfigurator/simulation/Cargo.toml +++ b/nexus/reconfigurator/simulation/Cargo.toml @@ -21,6 +21,7 @@ omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true petname = { workspace = true, default-features = false } slog.workspace = true +strum.workspace = true swrite.workspace = true sync-ptr.workspace = true thiserror.workspace = true diff --git 
a/nexus/reconfigurator/simulation/src/system.rs b/nexus/reconfigurator/simulation/src/system.rs index 958a77df351..4e9e7a48e8b 100644 --- a/nexus/reconfigurator/simulation/src/system.rs +++ b/nexus/reconfigurator/simulation/src/system.rs @@ -19,8 +19,11 @@ use nexus_types::{ internal_api::params::{DnsConfigParams, DnsConfigZone}, inventory::{CabooseWhich, Collection}, }; -use omicron_common::{address::IpRange, api::external::Generation}; +use omicron_common::{ + address::IpRange, api::external::Generation, disk::M2Slot, +}; use omicron_uuid_kinds::{BlueprintUuid, CollectionUuid, SledUuid}; +use strum::IntoEnumIterator as _; use crate::{ LoadSerializedResultBuilder, @@ -769,6 +772,16 @@ impl SimSystemBuilderInner { .and_then(|baseboard_id| { let inv_sp = primary_collection.sps.get(baseboard_id); let inv_rot = primary_collection.rots.get(baseboard_id); + let sp_host_phase_1_hash_flash = M2Slot::iter() + .filter_map(|slot| { + let found = primary_collection + .host_phase_1_flash_hash_for( + slot, + baseboard_id, + )?; + Some((slot, found.hash)) + }) + .collect(); let sp_active = primary_collection .caboose_for(CabooseWhich::SpSlot0, baseboard_id) .map(|c| c.caboose.clone()); @@ -780,6 +793,7 @@ impl SimSystemBuilderInner { baseboard_id: &baseboard_id, sp: inv_sp, rot: inv_rot, + sp_host_phase_1_hash_flash, sp_active, sp_inactive, }) diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index cd6ba1393ea..fc32a4824f5 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -27,6 +27,7 @@ use external_api::http_entrypoints::external_api; use internal_api::http_entrypoints::internal_api; use nexus_config::NexusConfig; use nexus_db_model::RendezvousDebugDataset; +use nexus_db_queries::db; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; @@ -335,6 +336,10 @@ impl nexus_test_interface::NexusServer for Server { Server::start(internal_server).await.unwrap() } + fn datastore(&self) -> &Arc 
{ + self.apictx.context.nexus.datastore() + } + async fn get_http_server_external_address(&self) -> SocketAddr { self.apictx.context.nexus.get_external_server_address().await.unwrap() } diff --git a/nexus/test-interface/Cargo.toml b/nexus/test-interface/Cargo.toml index 00da4cb6a3b..f608645f6b8 100644 --- a/nexus/test-interface/Cargo.toml +++ b/nexus/test-interface/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [dependencies] async-trait.workspace = true nexus-config.workspace = true +nexus-db-queries.workspace = true nexus-sled-agent-shared.workspace = true nexus-types.workspace = true omicron-common.workspace = true diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index b6a38be0631..b32491da301 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -33,6 +33,7 @@ use async_trait::async_trait; use nexus_config::NexusConfig; +use nexus_db_queries::db; use nexus_types::deployment::Blueprint; use nexus_types::internal_api::params::{ PhysicalDiskPutRequest, ZpoolPutRequest, @@ -43,6 +44,7 @@ use omicron_common::disk::DatasetKind; use omicron_uuid_kinds::DatasetUuid; use slog::Logger; use std::net::{SocketAddr, SocketAddrV6}; +use std::sync::Arc; #[async_trait] pub trait NexusServer: Send + Sync + 'static { @@ -81,6 +83,8 @@ pub trait NexusServer: Send + Sync + 'static { >, ) -> Self; + fn datastore(&self) -> &Arc; + async fn get_http_server_external_address(&self) -> SocketAddr; async fn get_http_server_techport_address(&self) -> SocketAddr; async fn get_http_server_internal_address(&self) -> SocketAddr; diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index fd01779c745..81ab9be3f9f 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -50,6 +50,7 @@ serde_json.workspace = true serde_urlencoded.workspace = true sled-agent-client.workspace = true slog.workspace = true +slog-error-chain.workspace = true tokio.workspace = true tokio-postgres = { workspace = true, 
features = ["with-serde_json-1"] } tokio-util.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index d92c056cb01..95c6cc7e609 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -33,6 +33,7 @@ use nexus_config::InternalDns; use nexus_config::MgdConfig; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; use nexus_config::NexusConfig; +use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pub_test_utils::crdb; use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots; use nexus_sled_agent_shared::inventory::OmicronSledConfig; @@ -79,6 +80,7 @@ use omicron_common::disk::CompressionAlgorithm; use omicron_common::zpool_name::ZpoolName; use omicron_sled_agent::sim; use omicron_test_utils::dev; +use omicron_test_utils::dev::poll; use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::DatasetUuid; @@ -95,6 +97,7 @@ use sled_agent_client::types::EarlyNetworkConfig; use sled_agent_client::types::EarlyNetworkConfigBody; use sled_agent_client::types::RackNetworkConfigV2; use slog::{Logger, debug, error, o}; +use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::HashMap; use std::fmt::Debug; @@ -226,6 +229,46 @@ impl ControlPlaneTestContext { format!("*.sys.{}", self.external_dns_zone_name) } + /// Wait until at least one inventory collection has been inserted into the + /// datastore. + /// + /// # Panics + /// + /// Panics if an inventory collection is not found within `timeout`. 
+ pub async fn wait_for_at_least_one_inventory_collection( + &self, + timeout: Duration, + ) { + let datastore = self.server.datastore(); + let opctx = + OpContext::for_tests(self.logctx.log.clone(), datastore.clone()); + + match wait_for_condition( + || async { + match datastore.inventory_get_latest_collection(&opctx).await { + Ok(Some(_)) => Ok(()), + Ok(None) => Err(CondCheckError::NotYet), + Err(err) => Err(CondCheckError::Failed(err)), + } + }, + &Duration::from_millis(500), + &timeout, + ) + .await + { + Ok(()) => (), + Err(poll::Error::TimedOut(elapsed)) => { + panic!("no inventory collection found within {elapsed:?}"); + } + Err(poll::Error::PermanentError(err)) => { + panic!( + "failed waiting for inventory collection: {}", + InlineErrorChain::new(&err) + ); + } + } + } + pub async fn teardown(mut self) { self.server.close().await; self.database.cleanup().await.unwrap(); diff --git a/nexus/tests/integration_tests/rack.rs b/nexus/tests/integration_tests/rack.rs index 6465e915fd3..9eebe3d2130 100644 --- a/nexus/tests/integration_tests/rack.rs +++ b/nexus/tests/integration_tests/rack.rs @@ -23,6 +23,7 @@ use nexus_types::internal_api::params::SledAgentInfo; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; use omicron_uuid_kinds::GenericUuid; +use std::time::Duration; use uuid::Uuid; type ControlPlaneTestContext = @@ -98,6 +99,12 @@ async fn test_rack_initialization(cptestctx: &ControlPlaneTestContext) { #[nexus_test] async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { + // Setup: wait until we've collected an inventory from the system set + // up by `#[nexus_test]`. 
+ cptestctx + .wait_for_at_least_one_inventory_collection(Duration::from_secs(60)) + .await; + let internal_client = &cptestctx.internal_client; let external_client = &cptestctx.external_client; let list_url = "/v1/system/hardware/sleds-uninitialized"; let mut uninitialized_sleds = @@ -159,6 +166,12 @@ async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { #[nexus_test] async fn test_sled_add(cptestctx: &ControlPlaneTestContext) { + // Setup: wait until we've collected an inventory from the system set + // up by `#[nexus_test]`. + cptestctx + .wait_for_at_least_one_inventory_collection(Duration::from_secs(60)) + .await; + let external_client = &cptestctx.external_client; let list_url = "/v1/system/hardware/sleds-uninitialized"; let mut uninitialized_sleds = diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index d7f2daaf53b..0130bee4339 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -36,6 +36,7 @@ use omicron_common::api::external::ByteCount; pub use omicron_common::api::internal::shared::NetworkInterface; pub use omicron_common::api::internal::shared::NetworkInterfaceKind; pub use omicron_common::api::internal::shared::SourceNatConfig; +use omicron_common::disk::M2Slot; pub use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::DatasetUuid; @@ -49,6 +50,7 @@ use std::collections::BTreeSet; use std::net::SocketAddrV6; use std::sync::Arc; use strum::EnumIter; +use tufaceous_artifact::ArtifactHash; mod display; @@ -103,6 +105,14 @@ pub struct Collection { /// table. #[serde_as(as = "Vec<(_, _)>")] pub sps: BTreeMap, ServiceProcessor>, + /// all host phase 1 flash hashes, keyed first by the phase 1 slot, then the + /// baseboard id of the sled where they were found + /// + /// In practice, these will be inserted into the + /// `inv_host_phase_1_flash_hash` table. 
+ #[serde_as(as = "BTreeMap<_, Vec<(_, _)>>")] + pub host_phase_1_flash_hashes: + BTreeMap, HostPhase1FlashHash>>, /// all roots of trust, keyed by baseboard id /// /// In practice, these will be inserted into the `inv_root_of_trust` table. @@ -160,6 +170,16 @@ pub struct Collection { } impl Collection { + pub fn host_phase_1_flash_hash_for( + &self, + slot: M2Slot, + baseboard_id: &BaseboardId, + ) -> Option<&HostPhase1FlashHash> { + self.host_phase_1_flash_hashes + .get(&slot) + .and_then(|by_bb| by_bb.get(baseboard_id)) + } + pub fn caboose_for( &self, which: CabooseWhich, @@ -386,6 +406,18 @@ pub struct RotState { pub stage0next_error: Option, } +/// Describes a host phase 1 flash hash found from a service processor +/// during collection +#[derive( + Clone, Debug, Ord, Eq, PartialOrd, PartialEq, Deserialize, Serialize, +)] +pub struct HostPhase1FlashHash { + pub time_collected: DateTime, + pub source: String, + pub slot: M2Slot, + pub hash: ArtifactHash, +} + /// Describes which caboose this is (which component, which slot) #[derive( Clone, diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index ffcdc775ce6..8bb7d2255f2 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -23,6 +23,7 @@ use nexus_sled_agent_shared::inventory::{ ConfigReconcilerInventoryStatus, HostPhase2DesiredContents, OmicronSledConfig, OmicronZoneImageSource, OrphanedDataset, }; +use omicron_common::disk::M2Slot; use omicron_uuid_kinds::{ DatasetUuid, OmicronZoneUuid, PhysicalDiskUuid, ZpoolUuid, }; @@ -342,7 +343,38 @@ fn display_devices( write!(f, " (cubby {})", sp.sp_slot)?; } writeln!(f, "")?; - writeln!(f, " found at: {} from {}", sp.time_collected, sp.source)?; + writeln!( + f, + " found at: {} from {}", + sp.time_collected + .to_rfc3339_opts(SecondsFormat::Millis, /* use_z */ true), + sp.source + )?; + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct HostPhase1FlashHashRow { 
+ slot: String, + hash: String, + } + + writeln!(f, " host phase 1 hashes:")?; + let host_phase1_hash_rows: Vec<_> = M2Slot::iter() + .filter_map(|s| { + collection + .host_phase_1_flash_hash_for(s, baseboard_id) + .map(|h| (s, h)) + }) + .map(|(slot, phase1)| HostPhase1FlashHashRow { + slot: format!("{slot:?}"), + hash: phase1.hash.to_string(), + }) + .collect(); + let table = tabled::Table::new(host_phase1_hash_rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + writeln!(f, "{}", textwrap::indent(&table.to_string(), " "))?; #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] diff --git a/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql b/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql new file mode 100644 index 00000000000..babdce1dd94 --- /dev/null +++ b/schema/crdb/add-inv-host-phase-1-flash-hash/up1.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.hw_m2_slot AS ENUM ( + 'A', + 'B' +); diff --git a/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql b/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql new file mode 100644 index 00000000000..77f1ed1aff9 --- /dev/null +++ b/schema/crdb/add-inv-host-phase-1-flash-hash/up2.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_host_phase_1_flash_hash ( + inv_collection_id UUID NOT NULL, + hw_baseboard_id UUID NOT NULL, + time_collected TIMESTAMPTZ NOT NULL, + source TEXT NOT NULL, + slot omicron.public.hw_m2_slot NOT NULL, + hash STRING(64) NOT NULL, + PRIMARY KEY (inv_collection_id, hw_baseboard_id, slot) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 5ba503fc8a3..438a46e4a54 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3508,6 +3508,35 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_root_of_trust ( PRIMARY KEY (inv_collection_id, hw_baseboard_id) ); +-- host phase 1 slots +CREATE TYPE IF NOT EXISTS omicron.public.hw_m2_slot AS ENUM ( + 'A', + 'B' +); + +-- 
host phase 1 flash hashes found +-- There are usually two rows here for each row in inv_service_processor, but +-- not necessarily (either or both slots' hash collection may fail). +CREATE TABLE IF NOT EXISTS omicron.public.inv_host_phase_1_flash_hash ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- which system this SP reports it is part of + -- (foreign key into `hw_baseboard_id` table) + hw_baseboard_id UUID NOT NULL, + -- when this observation was made + time_collected TIMESTAMPTZ NOT NULL, + -- which MGS instance reported this data + source TEXT NOT NULL, + + -- phase 1 slot for this hash + slot omicron.public.hw_m2_slot NOT NULL, + -- the actual hash of the contents + hash STRING(64) NOT NULL, + + PRIMARY KEY (inv_collection_id, hw_baseboard_id, slot) +); + CREATE TYPE IF NOT EXISTS omicron.public.caboose_which AS ENUM ( 'sp_slot_0', 'sp_slot_1', @@ -6237,7 +6266,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '167.0.0', NULL) + (TRUE, NOW(), NOW(), '168.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/sp-sim/src/update.rs b/sp-sim/src/update.rs index 92e582f5b13..a5765189a79 100644 --- a/sp-sim/src/update.rs +++ b/sp-sim/src/update.rs @@ -46,9 +46,9 @@ pub(crate) struct SimSpUpdate { /// state of hashing each of the host phase1 slots phase1_hash_state: BTreeMap, /// how do we decide when we're done hashing host phase1 slots? this allows - /// us to default to `TIME_TO_HASH_HOST_PHASE_1` (e.g., for running sp-sim - /// as a part of `omicron-dev`) while giving tests that want explicit - /// control the ability to precisely trigger completion of hashing. + /// us to default to "instant" (e.g., for running sp-sim as a part of + /// `omicron-dev`) while giving tests that want explicit control the ability + /// to precisely trigger completion of hashing. 
phase1_hash_policy: HostFlashHashPolicyInner, /// records whether a change to the stage0 "active slot" has been requested