Skip to content

Add host phase 1 flash hashes to inventory #8624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jul 21, 2025
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions clients/gateway-client/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@ serde_json.workspace = true
schemars.workspace = true
slog.workspace = true
thiserror.workspace = true
tokio.workspace = true
uuid.workspace = true
omicron-workspace-hack.workspace = true
101 changes: 101 additions & 0 deletions clients/gateway-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
//! Interface for API requests to a Management Gateway Service (MGS) instance

pub use gateway_messages::SpComponent;
use std::time::Duration;
use std::time::Instant;
use types::ComponentFirmwareHashStatus;

// We specifically want to allow consumers, such as `wicketd`, to embed
// inventory datatypes into their own APIs, rather than recreate structs.
Expand Down Expand Up @@ -97,3 +100,101 @@ impl PartialOrd for crate::types::SpIdentifier {
Some(self.cmp(other))
}
}

#[derive(Debug, thiserror::Error)]
pub enum HostPhase1HashError {
#[error("timed out waiting for hash calculation")]
Timeout,
#[error("hash calculation failed (phase1 written while hashing?)")]
ContentsModifiedWhileHashing,
#[error("failed to send request to {kind}")]
RequestError {
kind: &'static str,
#[source]
err: Error<types::Error>,
},
}

impl Client {
/// Get the hash of the host phase 1 flash contents in the given slot.
///
/// This operation is implemented asynchronously on the SP: a client (us)
/// must request the hash be calculated, then poll until the calculation is
/// complete. This method takes care of the "start / poll" operation; the
/// caller must provide a timeout for how long they're willing to wait for
/// the calculation to complete. In practice, we expect this to take a
/// handful of seconds on real hardware.
pub async fn host_phase_1_flash_hash_calculate_with_timeout(
&self,
sp: types::SpIdentifier,
phase1_slot: u16,
timeout: Duration,
) -> Result<[u8; 32], HostPhase1HashError> {
const SLEEP_BETWEEN_POLLS: Duration = Duration::from_secs(1);
const PHASE1_FLASH: &str =
SpComponent::HOST_CPU_BOOT_FLASH.const_as_str();

let need_to_start_hashing = match self
.sp_component_hash_firmware_get(
sp.type_,
sp.slot,
PHASE1_FLASH,
phase1_slot,
)
.await
.map_err(|err| HostPhase1HashError::RequestError {
kind: "get hash",
err,
})?
.into_inner()
{
ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash),
ComponentFirmwareHashStatus::HashInProgress => false,
ComponentFirmwareHashStatus::HashNotCalculated => true,
};

if need_to_start_hashing {
self.sp_component_hash_firmware_start(
sp.type_,
sp.slot,
PHASE1_FLASH,
phase1_slot,
)
.await
.map_err(|err| HostPhase1HashError::RequestError {
kind: "start hashing",
err,
})?;
}

let start = Instant::now();
loop {
tokio::time::sleep(SLEEP_BETWEEN_POLLS).await;
if start.elapsed() > timeout {
return Err(HostPhase1HashError::Timeout);
}
match self
.sp_component_hash_firmware_get(
sp.type_,
sp.slot,
PHASE1_FLASH,
phase1_slot,
)
.await
.map_err(|err| HostPhase1HashError::RequestError {
kind: "get hash",
err,
})?
.into_inner()
{
ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash),
ComponentFirmwareHashStatus::HashInProgress => continue,
ComponentFirmwareHashStatus::HashNotCalculated => {
return Err(
HostPhase1HashError::ContentsModifiedWhileHashing,
);
}
}
}
}
}
1 change: 1 addition & 0 deletions common/src/disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,7 @@ impl DiskManagementError {
Deserialize,
Serialize,
JsonSchema,
strum::EnumIter,
)]
pub enum M2Slot {
A,
Expand Down
35 changes: 33 additions & 2 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ use omicron_common::api::external::DataPageParams;
use omicron_common::api::external::Generation;
use omicron_common::api::external::InstanceState;
use omicron_common::api::external::MacAddr;
use omicron_common::disk::M2Slot;
use omicron_uuid_kinds::CollectionUuid;
use omicron_uuid_kinds::DatasetUuid;
use omicron_uuid_kinds::DownstairsRegionUuid;
Expand Down Expand Up @@ -7211,7 +7212,36 @@ async fn inv_collection_print_devices(
print!(" (cubby {})", sp.sp_slot);
}
println!("");
println!(" found at: {} from {}", sp.time_collected, sp.source);
println!(
" found at: {} from {}",
sp.time_collected.to_rfc3339_opts(SecondsFormat::Secs, true),
sp.source
);

#[derive(Tabled)]
#[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
struct HostPhase1FlashHashRow {
slot: String,
hash: String,
}

println!(" host phase 1 hashes:");
let host_phase1_hash_rows: Vec<_> = M2Slot::iter()
.filter_map(|s| {
collection
.host_phase_1_flash_hash_for(s, baseboard_id)
.map(|h| (s, h))
})
.map(|(slot, phase1)| HostPhase1FlashHashRow {
slot: format!("{slot:?}"),
hash: phase1.hash.to_string(),
})
.collect();
let table = tabled::Table::new(host_phase1_hash_rows)
.with(tabled::settings::Style::empty())
.with(tabled::settings::Padding::new(0, 1, 0, 0))
.to_string();
println!("{}", textwrap::indent(&table.to_string(), " "));

#[derive(Tabled)]
#[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
Expand Down Expand Up @@ -7356,7 +7386,8 @@ fn inv_collection_print_sleds(collection: &Collection) {
);
println!(
" found at: {} from {}",
sled.time_collected, sled.source
sled.time_collected.to_rfc3339_opts(SecondsFormat::Secs, true),
sled.source
);
println!(" address: {}", sled.sled_agent_address);
println!(" usable hw threads: {}", sled.usable_hardware_threads);
Expand Down
26 changes: 18 additions & 8 deletions dev-tools/omdb/tests/test_all_output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ use nexus_test_utils_macros::nexus_test;
use nexus_types::deployment::Blueprint;
use nexus_types::deployment::SledFilter;
use nexus_types::deployment::UnstableReconfiguratorState;
use omicron_common::api::external::SwitchLocation;
use omicron_test_utils::dev::test_cmds::Redactor;
use omicron_test_utils::dev::test_cmds::path_to_executable;
use omicron_test_utils::dev::test_cmds::run_command;
use slog_error_chain::InlineErrorChain;
use std::fmt::Write;
use std::net::IpAddr;
use std::path::Path;
use std::time::Duration;
use subprocess::Exec;
use uuid::Uuid;

Expand Down Expand Up @@ -131,17 +133,20 @@ async fn test_omdb_usage_errors() {
async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
clear_omdb_env();

let gwtestctx = gateway_test_utils::setup::test_setup(
"test_omdb_success_case",
gateway_messages::SpPort::One,
)
.await;
let cmd_path = path_to_executable(CMD_OMDB);

let postgres_url = cptestctx.database.listen_url();
let nexus_internal_url =
format!("http://{}/", cptestctx.internal_client.bind_address);
let mgs_url = format!("http://{}/", gwtestctx.client.bind_address);
let mgs_url = format!(
"http://{}/",
cptestctx
.gateway
.get(&SwitchLocation::Switch0)
.expect("nexus_test always sets up MGS on switch 0")
.client
.bind_address
);
let ox_url = format!("http://{}/", cptestctx.oximeter.server_address());
let ox_test_producer = cptestctx.producer.address().ip();
let ch_url = format!("http://{}/", cptestctx.clickhouse.http_address());
Expand All @@ -165,6 +170,13 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
)
.await;

// Wait for Nexus to have gathered at least one inventory collection. (We'll
// check below that `reconfigurator export` contains at least one, so have
// to wait until there's one to export.)
cptestctx
.wait_for_at_least_one_inventory_collection(Duration::from_secs(60))
.await;

let mut output = String::new();

let invocations: &[&[&str]] = &[
Expand Down Expand Up @@ -319,8 +331,6 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
&ox_url,
ox_test_producer,
);

gwtestctx.teardown().await;
}

/// Verify that we properly deal with cases where:
Expand Down
46 changes: 45 additions & 1 deletion nexus/db-model/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use nexus_db_schema::schema::inv_zone_manifest_zone;
use nexus_db_schema::schema::{
hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership,
inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset,
inv_last_reconciliation_dataset_result,
inv_host_phase_1_flash_hash, inv_last_reconciliation_dataset_result,
inv_last_reconciliation_disk_result,
inv_last_reconciliation_orphaned_dataset,
inv_last_reconciliation_zone_result, inv_mupdate_override_non_boot,
Expand Down Expand Up @@ -158,6 +158,36 @@ impl From<HwRotSlot> for RotSlot {
}
}

// See [`M2Slot`].
impl_enum_type!(
HwHostPhase1SlotEnum:

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)]
pub enum HwHostPhase1Slot;
Copy link
Contributor

@sunshowers sunshowers Jul 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thoughts on just calling it something like M2Slot so the enum can be used everywhere we need something like this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry if this is rambly - thinking out loud. I definitely want to use something like this in other places (even on this PR!). I worry slightly about conflating two strongly-related-but-slightly-different things:

  1. The SP exposes exactly 2 host phase 1 slots.
  2. gimlet and cosmo have two M.2 drives, each of which is directly associated with one of the two host phase 1 slots

Things I have vague worry might change in the future:

  1. Will future sleds keep using the M.2 form factor? (This is just a naming concern, but it's why in sled-agent I'd started to use "internal disk" instead of "M.2". Maybe this is overthinking it.)
  2. The M.2 drives have multiple partitions, and even multiple boot partitions. sled-agent currently treats partition 0 as the boot partition and partitions 1, 2, and 3 as reserved, but that doesn't match reality: partition 1 is also a boot partition, and pilot knows how to write to it. (We use this as a backup in manufacturing sometimes; if something gets screwed up in partition 0, we can still boot if there's a valid OS image in partition 1.) sled-agent eventually needs to learn what's going on here (or we need to change the OS behavior); at the moment we could theoretically boot out of partition 1 instead of 0 and be very confused.

Point 2 is a really long-winded way of saying "I'm not sure M2Slot is sufficient to represent the sled-agent side of this", and we might want to expand that to something that holds both the M2Slot and which partition it was. But maybe even still we keep using this enum for both phase 1 and "which disk is it"?

I think I'm coming around to "just call this M2Slot" since that's what we use everywhere else. If we need to change it in the future, we'll have a better idea then of what to rename it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed to HwM2Slot in bdbd14d


// Enum values
A => b"A"
B => b"B"
);

impl From<HwHostPhase1Slot> for M2Slot {
fn from(value: HwHostPhase1Slot) -> Self {
match value {
HwHostPhase1Slot::A => Self::A,
HwHostPhase1Slot::B => Self::B,
}
}
}

impl From<M2Slot> for HwHostPhase1Slot {
fn from(value: M2Slot) -> Self {
match value {
M2Slot::A => Self::A,
M2Slot::B => Self::B,
}
}
}

// See [`nexus_types::inventory::CabooseWhich`].
impl_enum_type!(
CabooseWhichEnum:
Expand Down Expand Up @@ -752,6 +782,19 @@ impl From<InvRootOfTrust> for nexus_types::inventory::RotState {
}
}

/// See [`nexus_types::inventory::HostPhase1FlashHash`].
#[derive(Queryable, Clone, Debug, Selectable)]
#[diesel(table_name = inv_host_phase_1_flash_hash)]
pub struct InvHostPhase1FlashHash {
pub inv_collection_id: Uuid,
pub hw_baseboard_id: Uuid,
pub time_collected: DateTime<Utc>,
pub source: String,

pub slot: HwHostPhase1Slot,
pub hash: ArtifactHash,
}

/// See [`nexus_types::inventory::CabooseFound`].
#[derive(Queryable, Clone, Debug, Selectable)]
#[diesel(table_name = inv_caboose)]
Expand Down Expand Up @@ -966,6 +1009,7 @@ impl InvSledConfigReconciler {
boot_partition_a_error: Option<String>,
boot_partition_b_error: Option<String>,
) -> Self {
// TODO-john replace this column with the hw_host_phase_1_slot enum?
let (boot_disk_slot, boot_disk_error) = match boot_disk {
Ok(M2Slot::A) => (Some(SqlU8(0)), None),
Ok(M2Slot::B) => (Some(SqlU8(1)), None),
Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: Version = Version::new(164, 0, 0);
pub const SCHEMA_VERSION: Version = Version::new(165, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(165, "add-inv-host-phase-1-flash-hash"),
KnownVersion::new(164, "fix-leaked-bp-oximeter-read-policy-rows"),
KnownVersion::new(163, "bp-desired-host-phase-2"),
KnownVersion::new(162, "bundle-by-creation"),
Expand Down
Loading
Loading