Add host phase 1 flash hashes to inventory #8624

Merged · 16 commits · Jul 21, 2025
4 changes: 4 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions clients/gateway-client/Cargo.toml
@@ -23,5 +23,6 @@ serde_json.workspace = true
schemars.workspace = true
slog.workspace = true
thiserror.workspace = true
tokio.workspace = true
uuid.workspace = true
omicron-workspace-hack.workspace = true
131 changes: 131 additions & 0 deletions clients/gateway-client/src/lib.rs
@@ -7,6 +7,9 @@
//! Interface for API requests to a Management Gateway Service (MGS) instance

pub use gateway_messages::SpComponent;
use std::time::Duration;
use std::time::Instant;
use types::ComponentFirmwareHashStatus;

// We specifically want to allow consumers, such as `wicketd`, to embed
// inventory datatypes into their own APIs, rather than recreate structs.
@@ -97,3 +100,131 @@ impl PartialOrd for crate::types::SpIdentifier {
Some(self.cmp(other))
}
}

#[derive(Debug, thiserror::Error)]
pub enum HostPhase1HashError {
    #[error("timed out waiting for hash calculation")]
    Timeout,
    #[error("hash calculation failed (phase1 written while hashing?)")]
    ContentsModifiedWhileHashing,
    #[error("failed to send request to {kind}")]
    RequestError {
        kind: &'static str,
        #[source]
        err: Error<types::Error>,
    },
}

impl Client {
    /// Get the hash of the host phase 1 flash contents in the given slot.
    ///
    /// This operation is implemented asynchronously on the SP: a client (us)
    /// must request the hash be calculated, then poll until the calculation is
    /// complete. This method takes care of the "start / poll" operation; the
    /// caller must provide a timeout for how long they're willing to wait for
    /// the calculation to complete. In practice, we expect this to take a
    /// handful of seconds on real hardware.
    pub async fn host_phase_1_flash_hash_calculate_with_timeout(
        &self,
        sp: types::SpIdentifier,
        phase1_slot: u16,
        timeout: Duration,
    ) -> Result<[u8; 32], HostPhase1HashError> {
        // The most common cases of calling this function are:
        //
        // 1. The hash is already calculated; we get it in the first `get`
        //    operation below and return after a single request to MGS.
        // 2. The hash needs to be recalculated; we'll issue a "start hashing"
        //    request then go into the polling loop. We expect to sit in that
        //    loop for a handful of seconds.
        //
        // Given these, we could make this poll duration longer, since we know
        // the operation takes a little while. But there are two arguments for
        // polling somewhat more frequently:
        //
        // 1. Timeouts, timeouts, always wrong; if we believe hashing takes
        //    (by way of example) 7 seconds and set the poll interval to
        //    something slightly larger than that (say 10 seconds), then a
        //    real device that takes slightly longer than our poll interval
        //    makes us wait 20 seconds.
        // 2. An uncommon case of calling this function is that our initial
        //    `get` returns `HashInProgress`; in this case we have no idea how
        //    long the hashing has already been running, so would not know how
        //    long to try to wait.
        //
        // It should be pretty cheap to poll the SP at 1 Hz, so we sidestep
        // both of those issues by doing so.
        const SLEEP_BETWEEN_POLLS: Duration = Duration::from_secs(1);
        const PHASE1_FLASH: &str =
            SpComponent::HOST_CPU_BOOT_FLASH.const_as_str();

        let need_to_start_hashing = match self
            .sp_component_hash_firmware_get(
                sp.type_,
                sp.slot,
                PHASE1_FLASH,
                phase1_slot,
            )
            .await
            .map_err(|err| HostPhase1HashError::RequestError {
                kind: "get hash",
                err,
            })?
            .into_inner()
        {
            ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash),
            ComponentFirmwareHashStatus::HashInProgress => false,
            ComponentFirmwareHashStatus::HashNotCalculated => true,
        };

        if need_to_start_hashing {
            // It's possible multiple Nexus instances race, all see
            // `HashNotCalculated` above, then all try to start hashing
            // here. The SP will accept the first request and return a
            // `HashInProgress` error for subsequent attempts, but MGS does
            // its best to make this operation idempotent; in particular, it
            // will catch a `HashInProgress` error here and return an HTTP
            // success. We'll return any other error.
            self.sp_component_hash_firmware_start(
                sp.type_,
                sp.slot,
                PHASE1_FLASH,
                phase1_slot,
            )
            .await
            .map_err(|err| HostPhase1HashError::RequestError {
                kind: "start hashing",
                err,
            })?;
        }

        let start = Instant::now();
        loop {
            tokio::time::sleep(SLEEP_BETWEEN_POLLS).await;
            if start.elapsed() > timeout {
                return Err(HostPhase1HashError::Timeout);
            }
            match self
                .sp_component_hash_firmware_get(
                    sp.type_,
                    sp.slot,
                    PHASE1_FLASH,
                    phase1_slot,
                )
                .await
                .map_err(|err| HostPhase1HashError::RequestError {
                    kind: "get hash",
                    err,
                })?
                .into_inner()
            {
                ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash),
                ComponentFirmwareHashStatus::HashInProgress => continue,
                ComponentFirmwareHashStatus::HashNotCalculated => {
                    return Err(
                        HostPhase1HashError::ContentsModifiedWhileHashing,
                    );
                }
            }
        }
    }
}
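
For orientation, a hypothetical caller of the new method might look like the sketch below; it is not part of this PR. It assumes a `gateway_client::Client` and `SpIdentifier` obtained elsewhere, that the two host phase 1 slots are numbered 0 and 1, and an illustrative 30-second timeout; the helper name `fetch_phase1_hashes` is made up here.

use std::time::Duration;

use gateway_client::types::SpIdentifier;
use gateway_client::{Client, HostPhase1HashError};

/// Hypothetical helper: fetch the phase 1 flash hash for both slots of one SP.
async fn fetch_phase1_hashes(
    client: &Client,
    sp: SpIdentifier,
) -> Result<[[u8; 32]; 2], HostPhase1HashError> {
    // Plenty of slack over the "handful of seconds" the method docs expect.
    let timeout = Duration::from_secs(30);
    let slot_0 = client
        .host_phase_1_flash_hash_calculate_with_timeout(sp.clone(), 0, timeout)
        .await?;
    let slot_1 = client
        .host_phase_1_flash_hash_calculate_with_timeout(sp, 1, timeout)
        .await?;
    Ok([slot_0, slot_1])
}
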
1 change: 1 addition & 0 deletions common/src/disk.rs
@@ -619,6 +619,7 @@ impl DiskManagementError {
Deserialize,
Serialize,
JsonSchema,
strum::EnumIter,
)]
pub enum M2Slot {
A,
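
The `strum::EnumIter` derive added here lets inventory code enumerate the M.2 slots rather than hard-coding them. A minimal sketch of that usage follows (hypothetical, assuming `M2Slot`'s variants are A and B and that the type lives at `omicron_common::disk::M2Slot`):

use omicron_common::disk::M2Slot;
use strum::IntoEnumIterator;

/// Hypothetical helper: list every M.2 / phase 1 slot to hash.
fn all_m2_slots() -> Vec<M2Slot> {
    M2Slot::iter().collect()
}
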
26 changes: 18 additions & 8 deletions dev-tools/omdb/tests/test_all_output.rs
@@ -16,13 +16,15 @@ use nexus_test_utils_macros::nexus_test;
use nexus_types::deployment::Blueprint;
use nexus_types::deployment::SledFilter;
use nexus_types::deployment::UnstableReconfiguratorState;
use omicron_common::api::external::SwitchLocation;
use omicron_test_utils::dev::test_cmds::Redactor;
use omicron_test_utils::dev::test_cmds::path_to_executable;
use omicron_test_utils::dev::test_cmds::run_command;
use slog_error_chain::InlineErrorChain;
use std::fmt::Write;
use std::net::IpAddr;
use std::path::Path;
use std::time::Duration;
use subprocess::Exec;
use uuid::Uuid;

@@ -131,17 +133,20 @@ async fn test_omdb_usage_errors() {
async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
clear_omdb_env();

let gwtestctx = gateway_test_utils::setup::test_setup(
"test_omdb_success_case",
gateway_messages::SpPort::One,
)
.await;
let cmd_path = path_to_executable(CMD_OMDB);

let postgres_url = cptestctx.database.listen_url();
let nexus_internal_url =
format!("http://{}/", cptestctx.internal_client.bind_address);
let mgs_url = format!("http://{}/", gwtestctx.client.bind_address);
let mgs_url = format!(
"http://{}/",
cptestctx
.gateway
.get(&SwitchLocation::Switch0)
.expect("nexus_test always sets up MGS on switch 0")
.client
.bind_address
);
let ox_url = format!("http://{}/", cptestctx.oximeter.server_address());
let ox_test_producer = cptestctx.producer.address().ip();
let ch_url = format!("http://{}/", cptestctx.clickhouse.http_address());
@@ -165,6 +170,13 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
)
.await;

// Wait for Nexus to have gathered at least one inventory collection. (We'll
// check below that `reconfigurator export` contains at least one, so we have
// to wait until there's one to export.)
cptestctx
.wait_for_at_least_one_inventory_collection(Duration::from_secs(60))
.await;

let mut output = String::new();

let invocations: &[&[&str]] = &[
@@ -319,8 +331,6 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
&ox_url,
ox_test_producer,
);

gwtestctx.teardown().await;
}

/// Verify that we properly deal with cases where:
2 changes: 1 addition & 1 deletion dev-tools/reconfigurator-cli/tests/input/cmds-example.txt
@@ -52,7 +52,7 @@ load-example --seed test-basic --nsleds 3 --sled-policy 1:non-provisionable --sl

blueprint-list
blueprint-show latest
inventory-show latest
inventory-show latest all

# Plan a blueprint run -- this will cause zones and disks on the expunged
# sled to be expunged.
68 changes: 67 additions & 1 deletion dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
Expand Up @@ -1013,13 +1013,79 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9
PENDING MGS-MANAGED UPDATES: 0


> inventory-show latest
> inventory-show latest all
collection: 9e187896-7809-46d0-9210-d75be1b3c4d4
collector: example
started: <REDACTED_TIMESTAMP>
done: <REDACTED_TIMESTAMP>
errors: 0

Sled serial0
part number: model0
power: A2
revision: 0
MGS slot: Sled 0 (cubby 0)
found at: <REDACTED_TIMESTAMP> from fake MGS 1
host phase 1 hashes:
SLOT HASH
A 0101010101010101010101010101010101010101010101010101010101010101
B 0202020202020202020202020202020202020202020202020202020202020202
cabooses:
SLOT BOARD NAME VERSION GIT_COMMIT SIGN
SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a
RoT pages:
SLOT DATA_BASE64
RoT: active slot: slot A
RoT: persistent boot preference: slot A
RoT: pending persistent boot preference: -
RoT: transient boot preference: -
RoT: slot A SHA3-256: slotAdigest1
RoT: slot B SHA3-256: slotBdigest1

Sled serial1
part number: model1
power: A2
revision: 0
MGS slot: Sled 1 (cubby 1)
found at: <REDACTED_TIMESTAMP> from fake MGS 1
host phase 1 hashes:
SLOT HASH
A 0101010101010101010101010101010101010101010101010101010101010101
B 0202020202020202020202020202020202020202020202020202020202020202
cabooses:
SLOT BOARD NAME VERSION GIT_COMMIT SIGN
SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a
RoT pages:
SLOT DATA_BASE64
RoT: active slot: slot A
RoT: persistent boot preference: slot A
RoT: pending persistent boot preference: -
RoT: transient boot preference: -
RoT: slot A SHA3-256: slotAdigest1
RoT: slot B SHA3-256: slotBdigest1

Sled serial2
part number: model2
power: A2
revision: 0
MGS slot: Sled 2 (cubby 2)
found at: <REDACTED_TIMESTAMP> from fake MGS 1
host phase 1 hashes:
SLOT HASH
A 0101010101010101010101010101010101010101010101010101010101010101
B 0202020202020202020202020202020202020202020202020202020202020202
cabooses:
SLOT BOARD NAME VERSION GIT_COMMIT SIGN
SpSlot0 SimGimletSp SimGimletSp 0.0.1 unknown n/a
RoT pages:
SLOT DATA_BASE64
RoT: active slot: slot A
RoT: persistent boot preference: slot A
RoT: pending persistent boot preference: -
RoT: transient boot preference: -
RoT: slot A SHA3-256: slotAdigest1
RoT: slot B SHA3-256: slotBdigest1

SLED AGENTS

sled 2eb69596-f081-4e2d-9425-9994926e0832 (role = Gimlet, serial serial1)