diff --git a/Cargo.lock b/Cargo.lock index 9db98afdd45..d8f2ad0fc80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6509,6 +6509,7 @@ dependencies = [ "internal-dns-types", "nexus-types", "omicron-common", + "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", "qorb", diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index c4b16790362..8219f7d34b2 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -60,6 +60,7 @@ progenitor::generate_api!( }), derives = [schemars::JsonSchema], patch = { + ComponentFirmwareHashStatus = { derives = [PartialEq, Eq, PartialOrd, Ord] }, HostPhase2RecoveryImageId = { derives = [PartialEq, Eq, PartialOrd, Ord] }, ImageVersion = { derives = [PartialEq, Eq, PartialOrd, Ord] }, RotImageDetails = { derives = [PartialEq, Eq, PartialOrd, Ord] }, diff --git a/gateway-api/src/lib.rs b/gateway-api/src/lib.rs index 8b0c5a6567b..18d068cfb7f 100644 --- a/gateway-api/src/lib.rs +++ b/gateway-api/src/lib.rs @@ -16,7 +16,7 @@ use gateway_types::{ SpState, }, component_details::SpComponentDetails, - host::HostStartupOptions, + host::{ComponentFirmwareHashStatus, HostStartupOptions}, ignition::{IgnitionCommand, SpIgnitionInfo}, rot::{RotCfpa, RotCfpaSlot, RotCmpa, RotState}, sensor::SpSensorReading, @@ -259,6 +259,41 @@ pub trait GatewayApi { path: Path, ) -> Result, HttpError>; + /// Start computing the hash of a given slot of a component. + /// + /// This endpoint is only valid for the `host-boot-flash` component. + /// + /// Computing the hash takes several seconds; callers should poll for results + /// using `sp_component_hash_firmware_get()`. In general they should call + /// `sp_component_hash_firmware_get()` first anyway, as the hashes are + /// cached in the SP and may already be ready. 
+ #[endpoint { + method = POST, + path = "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}", + }] + async fn sp_component_hash_firmware_start( + rqctx: RequestContext, + path: Path, + ) -> Result; + + /// Get a computed hash of a given slot of a component. + /// + /// This endpoint is only valid for the `host-boot-flash` component. + /// + /// Computing the hash takes several seconds; this endpoint returns the + /// current status. If the status is `HashNotCalculated`, callers should + /// start hashing using `sp_component_hash_firmware_start()`. If the status + /// is `HashInProgress`, callers should wait a bit then call this endpoint + /// again. + #[endpoint { + method = GET, + path = "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}", + }] + async fn sp_component_hash_firmware_get( + rqctx: RequestContext, + path: Path, + ) -> Result, HttpError>; + /// Abort any in-progress update an SP component /// /// Aborting an update to the SP itself is done via the component name @@ -542,6 +577,19 @@ pub struct PathSpComponent { pub component: String, } +#[derive(Deserialize, JsonSchema)] +pub struct PathSpComponentFirmwareSlot { + /// ID for the SP that the gateway service translates into the appropriate + /// port for communicating with the given SP. + #[serde(flatten)] + pub sp: SpIdentifier, + /// ID for the component of the SP; this is the internal identifier used by + /// the SP itself to identify its components. + pub component: String, + /// Firmware slot of the component. 
+ pub firmware_slot: u16, +} + #[derive(Deserialize, JsonSchema)] pub struct PathSpTaskDumpIndex { /// ID for the SP that the gateway service translates into the appropriate diff --git a/gateway-types/src/host.rs b/gateway-types/src/host.rs index 73130b0a1aa..c6bf9d30920 100644 --- a/gateway-types/src/host.rs +++ b/gateway-types/src/host.rs @@ -56,3 +56,18 @@ impl From for HostStartupOptions { } } } + +#[derive(Serialize, Deserialize, JsonSchema)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum ComponentFirmwareHashStatus { + /// The hash is not available; the client must issue a separate request to + /// begin calculating the hash. + HashNotCalculated, + /// The hash is currently being calculated; the client should sleep briefly + /// then check again. + /// + /// We expect this operation to take a handful of seconds in practice. + HashInProgress, + /// The hash of the given firmware slot. + Hashed { sha256: [u8; 32] }, +} diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 9123a749930..d45b738fba7 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -23,8 +23,10 @@ use dropshot::WebsocketEndpointResult; use dropshot::WebsocketUpgrade; use futures::TryFutureExt; use gateway_api::*; +use gateway_messages::HfError; use gateway_messages::RotBootInfo; use gateway_messages::SpComponent; +use gateway_messages::SpError; use gateway_sp_comms::HostPhase2Provider; use gateway_sp_comms::VersionedSpState; use gateway_sp_comms::error::CommunicationError; @@ -36,6 +38,7 @@ use gateway_types::component::SpComponentList; use gateway_types::component::SpIdentifier; use gateway_types::component::SpState; use gateway_types::component_details::SpComponentDetails; +use gateway_types::host::ComponentFirmwareHashStatus; use gateway_types::host::HostStartupOptions; use gateway_types::ignition::SpIgnitionInfo; use gateway_types::rot::RotCfpa; @@ -536,6 +539,93 @@ impl GatewayApi for GatewayImpl { 
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } + async fn sp_component_hash_firmware_start( + rqctx: RequestContext, + path: Path, + ) -> Result { + let apictx = rqctx.context(); + + let PathSpComponentFirmwareSlot { sp, component, firmware_slot } = + path.into_inner(); + let sp_id = sp.into(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + if component != SpComponent::HOST_CPU_BOOT_FLASH { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the host boot flash can be hashed".into(), + )); + } + + // The SP (reasonably!) returns a `HashInProgress` error if we try + // to start hashing while hashing is being calculated, but we're + // presenting an idempotent "start hashing if it isn't started" + // endpoint instead. Swallow that error. + match sp.start_host_flash_hash(firmware_slot).await { + Ok(()) + | Err(CommunicationError::SpError(SpError::Hf( + HfError::HashInProgress, + ))) => Ok(HttpResponseUpdatedNoContent()), + Err(err) => { + Err(SpCommsError::SpCommunicationFailed { sp: sp_id, err } + .into()) + } + } + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await + } + + async fn sp_component_hash_firmware_get( + rqctx: RequestContext, + path: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + + let PathSpComponentFirmwareSlot { sp, component, firmware_slot } = + path.into_inner(); + let sp_id = sp.into(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + if component != SpComponent::HOST_CPU_BOOT_FLASH { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the host boot flash can be hashed".into(), + )); + } + + let status = match sp.get_host_flash_hash(firmware_slot).await { + // success + Ok(sha256) => ComponentFirmwareHashStatus::Hashed { sha256 }, + + // 
expected failure: hash needs to be calculated (or + // recalculated; either way the client operation is the same) + Err(CommunicationError::SpError(SpError::Hf( + HfError::HashUncalculated | HfError::RecalculateHash, + ))) => ComponentFirmwareHashStatus::HashNotCalculated, + + // expected failure: hashing is currently in progress; client + // needs to wait and try again later + Err(CommunicationError::SpError(SpError::Hf( + HfError::HashInProgress, + ))) => ComponentFirmwareHashStatus::HashInProgress, + + // other errors are failures + Err(err) => { + return Err(HttpError::from( + SpCommsError::SpCommunicationFailed { sp: sp_id, err }, + )); + } + }; + + Ok(HttpResponseOk(status)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await + } + async fn sp_component_update_abort( rqctx: RequestContext, path: Path, diff --git a/nexus/mgs-updates/Cargo.toml b/nexus/mgs-updates/Cargo.toml index 9c47c14cf47..df65ddd07f7 100644 --- a/nexus/mgs-updates/Cargo.toml +++ b/nexus/mgs-updates/Cargo.toml @@ -41,6 +41,7 @@ dropshot.workspace = true gateway-messages.workspace = true gateway-test-utils.workspace = true hubtools.workspace = true +omicron-test-utils.workspace = true rand.workspace = true repo-depot-api.workspace = true sp-sim.workspace = true diff --git a/nexus/mgs-updates/tests/host_phase1_hash.rs b/nexus/mgs-updates/tests/host_phase1_hash.rs new file mode 100644 index 00000000000..0723c9f2e2d --- /dev/null +++ b/nexus/mgs-updates/tests/host_phase1_hash.rs @@ -0,0 +1,255 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Test host phase 1 hash flashing via MGS. +//! +//! This operation is implemented asynchronously on the SP side: we must first +//! ask it to start hashing, then we'll get "still hashing" errors for a few +//! seconds, then we'll get the hash result. 
+ +use gateway_client::Client; +use gateway_client::SpComponent; +use gateway_client::types::ComponentFirmwareHashStatus; +use gateway_client::types::SpType; +use gateway_client::types::SpUpdateStatus; +use gateway_messages::SpPort; +use gateway_test_utils::setup as mgs_setup; +use omicron_test_utils::dev::poll::CondCheckError; +use omicron_test_utils::dev::poll::wait_for_condition; +use sha2::Digest as _; +use sha2::Sha256; +use sp_sim::SimulatedSp; +use std::time::Duration; +use uuid::Uuid; + +struct Phase1HashStatusChecker<'a> { + mgs_client: &'a Client, + sp_type: SpType, + sp_slot: u16, + sp_component: &'a str, +} + +impl Phase1HashStatusChecker<'_> { + async fn assert_status( + &self, + expected: &[(u16, ComponentFirmwareHashStatus)], + ) { + for (firmware_slot, expected_status) in expected { + let status = self + .mgs_client + .sp_component_hash_firmware_get( + self.sp_type, + self.sp_slot, + self.sp_component, + *firmware_slot, + ) + .await + .expect("got firmware hash status"); + assert_eq!( + status.into_inner(), + *expected_status, + "unexpected status for slot {firmware_slot}" + ); + } + } +} + +// This is primarily a test of the `sp-sim` implementation of host phase 1 +// hashing, with a minor side test that MGS's endpoints wrap it faithfully. +#[tokio::test] +async fn test_host_phase1_hashing() { + // Start MGS + Sim SP. + let mgstestctx = mgs_setup::test_setup( + "test_host_phase1_updater_updates_sled", + SpPort::One, + ) + .await; + + // We'll only talk to one sp-sim for this test. + let mgs_client = mgstestctx.client(); + let sp_sim = &mgstestctx.simrack.gimlets[0]; + let sp_type = SpType::Sled; + let sp_component = SpComponent::HOST_CPU_BOOT_FLASH.const_as_str(); + let sp_slot = 0; + let phase1_checker = Phase1HashStatusChecker { + mgs_client: &mgs_client, + sp_type, + sp_slot, + sp_component, + }; + + // We haven't yet started hashing; we should see `HashNotCalculated` for + // both slots. 
+ for firmware_slot in [0, 1] { + let status = mgs_client + .sp_component_hash_firmware_get( + sp_type, + sp_slot, + sp_component, + firmware_slot, + ) + .await + .expect("got firmware hash status"); + match status.into_inner() { + ComponentFirmwareHashStatus::HashNotCalculated => (), + other => panic!("unexpected status: {other:?}"), + } + } + + // We want explicit (i.e., not-timer-based) control over when hashing + // completes. + let hashing_complete_sender = + sp_sim.set_phase1_hash_policy_explicit_control().await; + + // Start hashing firmware slot 0. + mgs_client + .sp_component_hash_firmware_start(sp_type, sp_slot, sp_component, 0) + .await + .expect("started firmware hashing"); + + // We should see the expected status; hash is computing in slot 0 and not + // yet started in slot 1. + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::HashInProgress), + (1, ComponentFirmwareHashStatus::HashNotCalculated), + ]) + .await; + + // We can start hashing firmware slot 0 again; this should be a no-op while + // hashing is being done. + mgs_client + .sp_component_hash_firmware_start(sp_type, sp_slot, sp_component, 0) + .await + .expect("starting hashing while hashing should be okay"); + + // Calculate the hash we expect to see. + let expected_sha256_0 = Sha256::digest( + sp_sim.last_host_phase1_update_data(0).await.as_deref().unwrap_or(&[]), + ) + .into(); + + // Allow the next `get()` to succeed. + hashing_complete_sender.complete_next_hashing_attempt(); + + // We should see the expected status; hash is complete in slot 0 and not + // yet started in slot 1. + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::HashNotCalculated), + ]) + .await; + + // Repeat, but slot 1. 
+ mgs_client + .sp_component_hash_firmware_start(sp_type, sp_slot, sp_component, 1) + .await + .expect("started firmware hashing"); + hashing_complete_sender.complete_next_hashing_attempt(); + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + ]) + .await; + + // Upload a new, fake phase1 to slot 1. + let fake_phase1 = b"test_host_phase1_hashing_fake_data".as_slice(); + let expected_sha256_1 = Sha256::digest(fake_phase1).into(); + + // Drive the update to completion. + { + let update_id = Uuid::new_v4(); + mgs_client + .sp_component_update( + sp_type, + sp_slot, + sp_component, + 1, + &update_id, + fake_phase1, + ) + .await + .expect("started slot 1 update"); + wait_for_condition( + || async { + let update_status = mgs_client + .sp_component_update_status(sp_type, sp_slot, sp_component) + .await + .expect("got update status") + .into_inner(); + match update_status { + // expected terminal state + SpUpdateStatus::Complete { id } => { + if id == update_id { + Ok(()) + } else { + Err(CondCheckError::Failed(format!( + "unexpected complete ID \ + (got {id} expected {update_id})" + ))) + } + } + + // expected intermediate states + SpUpdateStatus::Preparing { .. } + | SpUpdateStatus::InProgress { .. } => { + Err(CondCheckError::NotYet) + } + + // never-expect-to-see states + SpUpdateStatus::None + | SpUpdateStatus::Aborted { .. } + | SpUpdateStatus::Failed { .. } + | SpUpdateStatus::RotError { .. } => { + Err(CondCheckError::Failed(format!( + "unexpected status: {update_status:?}" + ))) + } + } + }, + &Duration::from_millis(100), + &Duration::from_secs(30), + ) + .await + .expect("update to sp-sim completed within timeout"); + } + + // Confirm the simulator wrote the expected data in slot 1. 
+ let slot_1_data = sp_sim.last_host_phase1_update_data(1).await.unwrap(); + assert_eq!(*slot_1_data, *fake_phase1); + + // Writing an update should have put slot 1 back into the "needs hashing" + // state. + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::HashNotCalculated), + ]) + .await; + + // Start hashing firmware slot 1. + mgs_client + .sp_component_hash_firmware_start(sp_type, sp_slot, sp_component, 1) + .await + .expect("started firmware hashing"); + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::HashInProgress), + ]) + .await; + + // Allow hashing to complete. + hashing_complete_sender.complete_next_hashing_attempt(); + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::Hashed(expected_sha256_1)), + ]) + .await; + + mgstestctx.teardown().await; +} diff --git a/openapi/gateway.json b/openapi/gateway.json index 89ece0a0e73..c6fd494c0f1 100644 --- a/openapi/gateway.json +++ b/openapi/gateway.json @@ -716,6 +716,127 @@ } } }, + "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}": { + "get": { + "summary": "Get a computed hash of a given slot of a component.", + "description": "This endpoint is only valid for the `host-boot-flash` component.\n\nComputing the hash takes several seconds; callers can start hashing using `sp_component_hash_firmware_start()`.", + "operationId": "sp_component_hash_firmware_get", + "parameters": [ + { + "in": "path", + "name": "component", + "description": "ID for the component of the SP; this is the internal identifier used by the SP itself to identify its components.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "path", + "name": "firmware_slot", + "description": "Firmware slot of the component.", + "required": true, + "schema": { + "type": 
"integer", + "format": "uint16", + "minimum": 0 + } + }, + { + "in": "path", + "name": "slot", + "required": true, + "schema": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + { + "in": "path", + "name": "type", + "required": true, + "schema": { + "$ref": "#/components/schemas/SpType" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ComponentFirmwareHashStatus" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "post": { + "summary": "Start computing the hash of a given slot of a component.", + "description": "This endpoint is only valid for the `host-boot-flash` component.\n\nComputing the hash takes several seconds; callers can poll for results using `sp_component_hash_firmware_get()`.", + "operationId": "sp_component_hash_firmware_start", + "parameters": [ + { + "in": "path", + "name": "component", + "description": "ID for the component of the SP; this is the internal identifier used by the SP itself to identify its components.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "path", + "name": "firmware_slot", + "description": "Firmware slot of the component.", + "required": true, + "schema": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + { + "in": "path", + "name": "slot", + "required": true, + "schema": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + { + "in": "path", + "name": "type", + "required": true, + "schema": { + "$ref": "#/components/schemas/SpType" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/sp/{type}/{slot}/component/{component}/reset": { "post": { "summary": "Reset an SP component 
(possibly the SP itself).", @@ -1697,6 +1818,66 @@ }, "components": { "schemas": { + "ComponentFirmwareHashStatus": { + "oneOf": [ + { + "description": "The hash is not available; the client must issue a separate request to begin calculating the hash.", + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "hash_not_calculated" + ] + } + }, + "required": [ + "status" + ] + }, + { + "description": "The hash is currently being calculated; the client should sleep briefly then check again.\n\nWe expect this operation to take a handful of seconds in practice.", + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "hash_in_progress" + ] + } + }, + "required": [ + "status" + ] + }, + { + "description": "The hash of the given firmware slot.", + "type": "object", + "properties": { + "sha256": { + "type": "array", + "items": { + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "minItems": 32, + "maxItems": 32 + }, + "status": { + "type": "string", + "enum": [ + "hashed" + ] + } + }, + "required": [ + "sha256", + "status" + ] + } + ] + }, "Duration": { "type": "object", "properties": { diff --git a/oximeter/instruments/src/http.rs b/oximeter/instruments/src/http.rs index 74e569b30cc..c30a96a7850 100644 --- a/oximeter/instruments/src/http.rs +++ b/oximeter/instruments/src/http.rs @@ -6,7 +6,9 @@ // Copyright 2024 Oxide Computer Company -use dropshot::{HttpError, HttpResponse, RequestContext, ServerContext}; +use dropshot::{ + HttpResponse, HttpResponseError, RequestContext, ServerContext, +}; use futures::Future; use http::StatusCode; use oximeter::{ @@ -156,14 +158,15 @@ impl LatencyTracker { /// produces an expected `dropshot` response. This method runs and times the handler, records /// the latency in the appropriate timeseries, and forwards the result of the handler to the /// caller. 
- pub async fn instrument_dropshot_handler( + pub async fn instrument_dropshot_handler( &self, context: &RequestContext, handler: H, - ) -> Result + ) -> Result where R: HttpResponse, - H: Future>, + E: HttpResponseError, + H: Future>, T: ServerContext, { let start = Instant::now(); @@ -171,7 +174,7 @@ impl LatencyTracker { let latency = start.elapsed(); let status_code = match &result { Ok(response) => response.status_code(), - Err(ref e) => e.status_code.as_status(), + Err(ref e) => e.status_code().as_status(), }; if let Err(e) = self.update(&context.endpoint.operation_id, status_code, latency) diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs index 8d21fc4cb67..4dc2e19d20d 100644 --- a/sp-sim/src/gimlet.rs +++ b/sp-sim/src/gimlet.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use crate::HostFlashHashCompletionSender; use crate::Responsiveness; use crate::SimulatedSp; use crate::config::GimletConfig; @@ -404,6 +405,25 @@ impl Gimlet { pub fn last_request_handled(&self) -> Option { *self.last_request_handled.lock().unwrap() } + + /// Instead of host phase 1 hashing completing after a few seconds, return a + /// handle that can be used to explicitly trigger completion. + /// + /// # Panics + /// + /// Panics if this `Gimlet` was created with only an RoT instead of a full + /// SP + RoT complex. 
+ pub async fn set_phase1_hash_policy_explicit_control( + &self, + ) -> HostFlashHashCompletionSender { + self.handler + .as_ref() + .expect("gimlet was created with SP config") + .lock() + .await + .update_state + .set_phase1_hash_policy_explicit_control() + } } struct SerialConsoleTcpTask { diff --git a/sp-sim/src/lib.rs b/sp-sim/src/lib.rs index d33444164ec..d439f9477ff 100644 --- a/sp-sim/src/lib.rs +++ b/sp-sim/src/lib.rs @@ -26,6 +26,7 @@ pub use slog::Logger; use std::net::SocketAddrV6; use tokio::sync::mpsc; use tokio::sync::watch; +pub use update::HostFlashHashCompletionSender; pub const SIM_ROT_BOARD: &str = "SimRot"; pub const SIM_ROT_STAGE0_BOARD: &str = "SimRotStage0"; diff --git a/sp-sim/src/update.rs b/sp-sim/src/update.rs index de13a2c2106..032f1491c5d 100644 --- a/sp-sim/src/update.rs +++ b/sp-sim/src/update.rs @@ -27,6 +27,7 @@ use hubtools::RawHubrisImage; use sha2::Sha256; use sha3::Digest; use sha3::Sha3_256; +use tokio::sync::mpsc; // How long do we take to hash host flash? Real SPs take a handful of seconds; // we'll pick something similar. @@ -48,6 +49,11 @@ pub(crate) struct SimSpUpdate { last_host_phase1_update_data: BTreeMap>, /// state of hashing each of the host phase1 slots phase1_hash_state: BTreeMap, + /// how do we decide when we're done hashing host phase1 slots? this allows + /// us to default to `TIME_TO_HASH_HOST_PHASE_1` (e.g., for running sp-sim + /// as a part of `omicron-dev`) while giving tests that want explicit + /// control the ability to precisely trigger completion of hashing. 
+ phase1_hash_policy: HostFlashHashPolicy, /// records whether a change to the stage0 "active slot" has been requested pending_stage0_update: bool, @@ -188,6 +194,9 @@ impl SimSpUpdate { last_rot_update_data: None, last_host_phase1_update_data: BTreeMap::new(), phase1_hash_state: BTreeMap::new(), + phase1_hash_policy: HostFlashHashPolicy::Timer( + TIME_TO_HASH_HOST_PHASE_1, + ), pending_stage0_update: false, @@ -202,6 +211,16 @@ impl SimSpUpdate { } } + /// Instead of host phase 1 hashing completing after a few seconds, return a + /// handle that can be used to explicitly trigger completion. + pub(crate) fn set_phase1_hash_policy_explicit_control( + &mut self, + ) -> HostFlashHashCompletionSender { + let (tx, rx) = mpsc::unbounded_channel(); + self.phase1_hash_policy = HostFlashHashPolicy::Channel(rx); + HostFlashHashCompletionSender(tx) + } + pub(crate) fn sp_update_prepare( &mut self, id: UpdateId, @@ -530,8 +549,17 @@ impl SimSpUpdate { slot: u16, started: Instant, ) -> Result<[u8; 32], SpError> { - if started.elapsed() < TIME_TO_HASH_HOST_PHASE_1 { - return Err(SpError::Hf(HfError::HashInProgress)); + match &mut self.phase1_hash_policy { + HostFlashHashPolicy::Timer(duration) => { + if started.elapsed() < *duration { + return Err(SpError::Hf(HfError::HashInProgress)); + } + } + HostFlashHashPolicy::Channel(rx) => { + if rx.try_recv().is_err() { + return Err(SpError::Hf(HfError::HashInProgress)); + } + } } let data = self.last_host_phase1_update_data(slot); @@ -761,3 +789,23 @@ enum HostFlashHashState { Hashed([u8; 32]), HashInvalidated, } + +#[derive(Debug)] +enum HostFlashHashPolicy { + /// complete hashing after `Duration` has elapsed + Timer(Duration), + /// complete hashing if there's a message in this channel + Channel(mpsc::UnboundedReceiver<()>), +} + +pub struct HostFlashHashCompletionSender(mpsc::UnboundedSender<()>); + +impl HostFlashHashCompletionSender { + /// Allow the next request to get the hash result to succeed. 
+ /// + /// Multiple calls to this function will queue multiple hash result + /// successes. + pub fn complete_next_hashing_attempt(&self) { + self.0.send(()).expect("receiving sp-sim instance is gone"); + } +}