From 2f62959e7f14b1c5dab604bc68a91fd46adc423e Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 14 Jul 2025 10:36:08 -0400 Subject: [PATCH 1/7] add mgs endpoints for host flash hashing --- gateway-api/src/lib.rs | 50 +++++++++++- gateway/src/http_entrypoints.rs | 59 ++++++++++++++ openapi/gateway.json | 139 ++++++++++++++++++++++++++++++++ 3 files changed, 247 insertions(+), 1 deletion(-) diff --git a/gateway-api/src/lib.rs b/gateway-api/src/lib.rs index 8b0c5a6567b..ae6324a54e0 100644 --- a/gateway-api/src/lib.rs +++ b/gateway-api/src/lib.rs @@ -27,7 +27,7 @@ use gateway_types::{ }, }; use schemars::JsonSchema; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use uuid::Uuid; /// This endpoint is used to upload SP and ROT Hubris archives as well as phase 1 host OS @@ -259,6 +259,36 @@ pub trait GatewayApi { path: Path, ) -> Result, HttpError>; + /// Start computing the hash of a given slot of a component. + /// + /// This endpoint is only valid for the `host-boot-flash` component. + /// + /// Computing the hash takes several seconds; callers can poll for results + /// using `sp_component_hash_firmware_get()`. + #[endpoint { + method = POST, + path = "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}", + }] + async fn sp_component_hash_firmware_start( + rqctx: RequestContext, + path: Path, + ) -> Result; + + /// Get a computed hash of a given slot of a component. + /// + /// This endpoint is only valid for the `host-boot-flash` component. + /// + /// Computing the hash takes several seconds; callers can start hashing + /// using `sp_component_hash_firmware_start()`. + #[endpoint { + method = GET, + path = "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}", + }] + async fn sp_component_hash_firmware_get( + rqctx: RequestContext, + path: Path, + ) -> Result, HttpError>; + /// Abort any in-progress update an SP component /// /// Aborting an update to the SP itself is done via the component name @@ -542,6 +572,19 @@ pub struct PathSpComponent { pub component: String, } +#[derive(Deserialize, JsonSchema)] +pub struct PathSpComponentFirmwareSlot { + /// ID for the SP that the gateway service translates into the appropriate + /// port for communicating with the given SP. + #[serde(flatten)] + pub sp: SpIdentifier, + /// ID for the component of the SP; this is the internal identifier used by + /// the SP itself to identify its components. + pub component: String, + /// Firmware slot of the component. + pub firmware_slot: u16, +} + #[derive(Deserialize, JsonSchema)] pub struct PathSpTaskDumpIndex { /// ID for the SP that the gateway service translates into the appropriate @@ -615,3 +658,8 @@ pub struct PathSpIgnitionCommand { /// Ignition command to perform on the targeted SP. pub command: IgnitionCommand, } + +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct ComponentFirmwareHash { + pub sha256: [u8; 32], +} diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 9123a749930..0fc142d9f08 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -536,6 +536,65 @@ impl GatewayApi for GatewayImpl { apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } + async fn sp_component_hash_firmware_start( + rqctx: RequestContext, + path: Path, + ) -> Result { + let apictx = rqctx.context(); + + let PathSpComponentFirmwareSlot { sp, component, firmware_slot } = + path.into_inner(); + let sp_id = sp.into(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + if component != SpComponent::HOST_CPU_BOOT_FLASH { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the host boot flash can be hashed".into(), + )); + } + + sp.start_host_flash_hash(firmware_slot).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await + } + + async fn sp_component_hash_firmware_get( + rqctx: RequestContext, + path: Path, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + + let PathSpComponentFirmwareSlot { sp, component, firmware_slot } = + path.into_inner(); + let sp_id = sp.into(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + if component != SpComponent::HOST_CPU_BOOT_FLASH { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the host boot flash can be hashed".into(), + )); + } + + let sha256 = + sp.get_host_flash_hash(firmware_slot).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseOk(ComponentFirmwareHash { sha256 })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await + } + async fn sp_component_update_abort( rqctx: RequestContext, path: Path, diff --git a/openapi/gateway.json b/openapi/gateway.json index 89ece0a0e73..d6e64792369 100644 --- a/openapi/gateway.json +++ b/openapi/gateway.json @@ -716,6 +716,127 @@ } } }, + "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}": { + "get": { + "summary": "Get a computed hash of a given slot of a component.", + "description": "This endpoint is only valid for the `host-boot-flash` component.\n\nComputing the hash takes several seconds; callers can start hashing using `sp_component_hash_firmware_start()`.", + "operationId": "sp_component_hash_firmware_get", + "parameters": [ + { + "in": "path", + "name": "component", + "description": "ID for the component of the SP; this is the internal identifier used by the SP itself to identify its components.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "path", + "name": "firmware_slot", + "description": "Firmware slot of the component.", + "required": true, + "schema": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + { + "in": "path", + "name": "slot", + "required": true, + "schema": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + { + "in": "path", + "name": "type", + "required": true, + "schema": { + "$ref": "#/components/schemas/SpType" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ComponentFirmwareHash" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "post": { + "summary": "Start computing the hash of a given slot of a component.", + "description": "This endpoint is only valid for the `host-boot-flash` component.\n\nComputing the hash takes several seconds; callers can poll for results using `sp_component_hash_firmware_get()`.", + "operationId": "sp_component_hash_firmware_start", + "parameters": [ + { + "in": "path", + "name": "component", + "description": "ID for the component of the SP; this is the internal identifier used by the SP itself to identify its components.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "path", + "name": "firmware_slot", + "description": "Firmware slot of the component.", + "required": true, + "schema": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + { + "in": "path", + "name": "slot", + "required": true, + "schema": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + { + "in": "path", + "name": "type", + "required": true, + "schema": { + "$ref": "#/components/schemas/SpType" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/sp/{type}/{slot}/component/{component}/reset": { "post": { "summary": "Reset an SP component (possibly the SP itself).", @@ -1697,6 +1818,24 @@ }, "components": { "schemas": { + "ComponentFirmwareHash": { + "type": "object", + "properties": { + "sha256": { + "type": "array", + "items": { + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "minItems": 32, + "maxItems": 32 + } + }, + "required": [ + "sha256" + ] + }, "Duration": { "type": "object", "properties": { From f255b548b803e5d1a1cbc46d433dced9c40ccf45 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 14 Jul 2025 12:46:42 -0400 Subject: [PATCH 2/7] custom error type for hash response --- Cargo.lock | 2 + gateway-api/src/lib.rs | 5 +- gateway-types/Cargo.toml | 2 + gateway-types/src/error.rs | 56 ++++++++++++++ gateway-types/src/lib.rs | 1 + gateway/src/error.rs | 32 ++++++++ gateway/src/http_entrypoints.rs | 41 ++++++---- nexus/mgs-updates/tests/host_phase1_hash.rs | 56 ++++++++++++++ openapi/gateway.json | 86 ++++++++++++++++++++- oximeter/instruments/src/http.rs | 13 ++-- 10 files changed, 266 insertions(+), 28 deletions(-) create mode 100644 gateway-types/src/error.rs create mode 100644 nexus/mgs-updates/tests/host_phase1_hash.rs diff --git a/Cargo.lock b/Cargo.lock index 9db98afdd45..09edfee3e9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3663,6 +3663,7 @@ name = "gateway-types" version = "0.1.0" dependencies = [ "daft", + "dropshot", "gateway-messages", "hex", "ipcc", @@ -3671,6 +3672,7 @@ dependencies = [ "omicron-workspace-hack", "schemars", "serde", + "thiserror 2.0.12", "tufaceous-artifact", "uuid", ] diff --git a/gateway-api/src/lib.rs b/gateway-api/src/lib.rs index ae6324a54e0..97e01d5be4a 100644 --- a/gateway-api/src/lib.rs +++ b/gateway-api/src/lib.rs @@ -16,6 +16,7 @@ use gateway_types::{ SpState, }, component_details::SpComponentDetails, + error::ComponentFlashError, host::HostStartupOptions, ignition::{IgnitionCommand, SpIgnitionInfo}, rot::{RotCfpa, RotCfpaSlot, RotCmpa, RotState}, @@ -272,7 +273,7 @@ pub trait GatewayApi { async fn sp_component_hash_firmware_start( rqctx: RequestContext, path: Path, - ) -> Result; + ) -> Result; /// Get a computed hash of a given slot of a component. /// @@ -287,7 +288,7 @@ pub trait GatewayApi { async fn sp_component_hash_firmware_get( rqctx: RequestContext, path: Path, - ) -> Result, HttpError>; + ) -> Result, ComponentFlashError>; /// Abort any in-progress update an SP component /// diff --git a/gateway-types/Cargo.toml b/gateway-types/Cargo.toml index 406f51c36e8..ffead9c834e 100644 --- a/gateway-types/Cargo.toml +++ b/gateway-types/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [dependencies] daft.workspace = true +dropshot.workspace = true gateway-messages.workspace = true # Avoid depending on gateway-sp-comms! It is a pretty heavy dependency and # would only be used for From impls anyway. We put those impls in @@ -20,5 +21,6 @@ omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true schemars.workspace = true serde.workspace = true +thiserror.workspace = true tufaceous-artifact.workspace = true uuid.workspace = true diff --git a/gateway-types/src/error.rs b/gateway-types/src/error.rs new file mode 100644 index 00000000000..e50c16d3dfc --- /dev/null +++ b/gateway-types/src/error.rs @@ -0,0 +1,56 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use dropshot::ErrorStatusCode; +use dropshot::HttpError; +use dropshot::HttpResponseError; +use schemars::JsonSchema; +use serde::Serialize; + +#[derive(Debug, Serialize, JsonSchema, thiserror::Error)] +#[serde(tag = "state", rename_all = "snake_case")] +pub enum ComponentFlashError { + #[error("hash needs to be calculated")] + HashUncalculated, + #[error("hash needs to be recalculated")] + HashStale, + #[error("hash is currently being calculated; try again shortly")] + HashInProgress, + #[error("{internal_message}")] + Other { + message: String, + error_code: Option, + + // Skip serializing these fields, as they are used for the + // `fmt::Display` implementation and for determining the status + // code, respectively, rather than included in the response body: + #[serde(skip)] + internal_message: String, + #[serde(skip)] + status: ErrorStatusCode, + }, +} + +impl HttpResponseError for ComponentFlashError { + fn status_code(&self) -> ErrorStatusCode { + match self { + Self::HashUncalculated | Self::HashStale => { + ErrorStatusCode::BAD_REQUEST + } + Self::HashInProgress => ErrorStatusCode::SERVICE_UNAVAILABLE, + ComponentFlashError::Other { status, .. } => *status, + } + } +} + +impl From for ComponentFlashError { + fn from(error: HttpError) -> Self { + Self::Other { + message: error.external_message, + internal_message: error.internal_message, + status: error.status_code, + error_code: error.error_code, + } + } +} diff --git a/gateway-types/src/lib.rs b/gateway-types/src/lib.rs index 149cd1d8f04..5c0ded9ada3 100644 --- a/gateway-types/src/lib.rs +++ b/gateway-types/src/lib.rs @@ -7,6 +7,7 @@ pub mod caboose; pub mod component; pub mod component_details; +pub mod error; pub mod host; pub mod ignition; pub mod rot; diff --git a/gateway/src/error.rs b/gateway/src/error.rs index a78f3599129..500391119bf 100644 --- a/gateway/src/error.rs +++ b/gateway/src/error.rs @@ -6,10 +6,12 @@ use crate::management_switch::SpIdentifier; use dropshot::HttpError; +use gateway_messages::HfError; use gateway_messages::SpError; use gateway_sp_comms::BindError; pub use gateway_sp_comms::error::CommunicationError; use gateway_sp_comms::error::UpdateError; +use gateway_types::error::ComponentFlashError; use slog_error_chain::InlineErrorChain; use slog_error_chain::SlogInlineError; use std::error::Error; @@ -232,3 +234,33 @@ pub(crate) fn http_err_with_message( headers: None, } } + +/// Convert a `CommunicationError` with a given `sp` into a +/// `ComponentFlashError`. +pub(crate) fn map_component_flash_error( + sp: SpIdentifier, + err: CommunicationError, +) -> ComponentFlashError { + if let CommunicationError::SpError(err) = &err { + if let SpError::Hf(err) = err { + match err { + HfError::HashUncalculated => { + return ComponentFlashError::HashUncalculated; + } + HfError::RecalculateHash => { + return ComponentFlashError::HashStale; + } + HfError::HashInProgress => { + return ComponentFlashError::HashInProgress; + } + HfError::NotMuxedToSp + | HfError::BadAddress + | HfError::QspiTimeout + | HfError::QspiTransferError => (), + } + } + } + ComponentFlashError::from(HttpError::from( + SpCommsError::SpCommunicationFailed { sp, err }, + )) +} diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 0fc142d9f08..e8f1223bb25 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -9,6 +9,7 @@ use crate::ServerContext; use crate::error::SpCommsError; use crate::http_err_with_message; +use crate::map_component_flash_error; use base64::Engine; use dropshot::ApiDescription; use dropshot::HttpError; @@ -36,6 +37,7 @@ use gateway_types::component::SpComponentList; use gateway_types::component::SpIdentifier; use gateway_types::component::SpState; use gateway_types::component_details::SpComponentDetails; +use gateway_types::error::ComponentFlashError; use gateway_types::host::HostStartupOptions; use gateway_types::ignition::SpIgnitionInfo; use gateway_types::rot::RotCfpa; @@ -539,26 +541,28 @@ impl GatewayApi for GatewayImpl { async fn sp_component_hash_firmware_start( rqctx: RequestContext, path: Path, - ) -> Result { + ) -> Result { let apictx = rqctx.context(); let PathSpComponentFirmwareSlot { sp, component, firmware_slot } = path.into_inner(); let sp_id = sp.into(); let handler = async { - let sp = apictx.mgmt_switch.sp(sp_id)?; + let sp = apictx.mgmt_switch.sp(sp_id).map_err(HttpError::from)?; let component = component_from_str(&component)?; if component != SpComponent::HOST_CPU_BOOT_FLASH { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the host boot flash can be hashed".into(), + return Err(ComponentFlashError::from( + HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the host boot flash can be hashed".into(), + ), )); } - sp.start_host_flash_hash(firmware_slot).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + sp.start_host_flash_hash(firmware_slot) + .await + .map_err(|err| map_component_flash_error(sp_id, err))?; Ok(HttpResponseUpdatedNoContent()) }; @@ -568,27 +572,30 @@ impl GatewayApi for GatewayImpl { async fn sp_component_hash_firmware_get( rqctx: RequestContext, path: Path, - ) -> Result, HttpError> { + ) -> Result, ComponentFlashError> + { let apictx = rqctx.context(); let PathSpComponentFirmwareSlot { sp, component, firmware_slot } = path.into_inner(); let sp_id = sp.into(); let handler = async { - let sp = apictx.mgmt_switch.sp(sp_id)?; + let sp = apictx.mgmt_switch.sp(sp_id).map_err(HttpError::from)?; let component = component_from_str(&component)?; if component != SpComponent::HOST_CPU_BOOT_FLASH { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the host boot flash can be hashed".into(), + return Err(ComponentFlashError::from( + HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the host boot flash can be hashed".into(), + ), )); } - let sha256 = - sp.get_host_flash_hash(firmware_slot).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let sha256 = sp + .get_host_flash_hash(firmware_slot) + .await + .map_err(|err| map_component_flash_error(sp_id, err))?; Ok(HttpResponseOk(ComponentFirmwareHash { sha256 })) }; diff --git a/nexus/mgs-updates/tests/host_phase1_hash.rs b/nexus/mgs-updates/tests/host_phase1_hash.rs new file mode 100644 index 00000000000..348828557b3 --- /dev/null +++ b/nexus/mgs-updates/tests/host_phase1_hash.rs @@ -0,0 +1,56 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Test host phase 1 hash flashing via MGS. +//! +//! This operation is implemented asynchronously on the SP side: we must first +//! ask it to start hashing, then we'll get "still hashing" errors for a few +//! seconds, then we'll get the hash result. + +use gateway_client::Error; +use gateway_client::SpComponent; +use gateway_client::types::ComponentFlashError; +use gateway_client::types::SpType; +use gateway_messages::SpPort; +use gateway_test_utils::setup as mgs_setup; +use slog_error_chain::InlineErrorChain; + +#[tokio::test] +async fn test_host_phase1_hashing() { + // Start MGS + Sim SP. + let mgstestctx = mgs_setup::test_setup( + "test_host_phase1_updater_updates_sled", + SpPort::One, + ) + .await; + + // We'll only talk to one sp-sim for this test. + let mgs_client = mgstestctx.client(); + let sp_type = SpType::Sled; + let sp_component = SpComponent::HOST_CPU_BOOT_FLASH.const_as_str(); + let sp_slot = 0; + + // We haven't yet started hashing; we should get the error we expect for + // both slots. + for firmware_slot in [0, 1] { + match mgs_client + .sp_component_hash_firmware_get( + sp_type, + sp_slot, + sp_component, + firmware_slot, + ) + .await + { + Err(Error::ErrorResponse(err)) => match err.into_inner() { + ComponentFlashError::HashUncalculated => (), + other => panic!("unexpected error response: {other:?}"), + }, + Err(err) => { + panic!("unexpected error: {}", InlineErrorChain::new(&err)) + } + Ok(resp) => panic!("unexpected success: {resp:?}"), + } + } +} diff --git a/openapi/gateway.json b/openapi/gateway.json index d6e64792369..ed5db888cd5 100644 --- a/openapi/gateway.json +++ b/openapi/gateway.json @@ -773,10 +773,10 @@ } }, "4XX": { - "$ref": "#/components/responses/Error" + "$ref": "#/components/responses/ComponentFlashError" }, "5XX": { - "$ref": "#/components/responses/Error" + "$ref": "#/components/responses/ComponentFlashError" } } }, @@ -829,10 +829,10 @@ "description": "resource updated" }, "4XX": { - "$ref": "#/components/responses/Error" + "$ref": "#/components/responses/ComponentFlashError" }, "5XX": { - "$ref": "#/components/responses/Error" + "$ref": "#/components/responses/ComponentFlashError" } } } @@ -1836,6 +1836,74 @@ "sha256" ] }, + "ComponentFlashError": { + "oneOf": [ + { + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "hash_uncalculated" + ] + } + }, + "required": [ + "state" + ] + }, + { + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "hash_stale" + ] + } + }, + "required": [ + "state" + ] + }, + { + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "hash_in_progress" + ] + } + }, + "required": [ + "state" + ] + }, + { + "type": "object", + "properties": { + "error_code": { + "nullable": true, + "type": "string" + }, + "message": { + "type": "string" + }, + "state": { + "type": "string", + "enum": [ + "other" + ] + } + }, + "required": [ + "message", + "state" + ] + } + ] + }, "Duration": { "type": "object", "properties": { @@ -3914,6 +3982,16 @@ } } } + }, + "ComponentFlashError": { + "description": "ComponentFlashError", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ComponentFlashError" + } + } + } } } } diff --git a/oximeter/instruments/src/http.rs b/oximeter/instruments/src/http.rs index 74e569b30cc..c30a96a7850 100644 --- a/oximeter/instruments/src/http.rs +++ b/oximeter/instruments/src/http.rs @@ -6,7 +6,9 @@ // Copyright 2024 Oxide Computer Company -use dropshot::{HttpError, HttpResponse, RequestContext, ServerContext}; +use dropshot::{ + HttpResponse, HttpResponseError, RequestContext, ServerContext, +}; use futures::Future; use http::StatusCode; use oximeter::{ @@ -156,14 +158,15 @@ impl LatencyTracker { /// produces an expected `dropshot` response. This method runs and times the handler, records /// the latency in the appropriate timeseries, and forwards the result of the handler to the /// caller. - pub async fn instrument_dropshot_handler( + pub async fn instrument_dropshot_handler( &self, context: &RequestContext, handler: H, - ) -> Result + ) -> Result where R: HttpResponse, - H: Future>, + E: HttpResponseError, + H: Future>, T: ServerContext, { let start = Instant::now(); @@ -171,7 +174,7 @@ impl LatencyTracker { let latency = start.elapsed(); let status_code = match &result { Ok(response) => response.status_code(), - Err(ref e) => e.status_code.as_status(), + Err(ref e) => e.status_code().as_status(), }; if let Err(e) = self.update(&context.endpoint.operation_id, status_code, latency) From 5abd800da222987c6290f98e1bebadf310490c65 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 14 Jul 2025 14:10:55 -0400 Subject: [PATCH 3/7] return hash status instead of custom errors --- Cargo.lock | 2 - gateway-api/src/lib.rs | 14 +--- gateway-types/Cargo.toml | 2 - gateway-types/src/error.rs | 56 ------------- gateway-types/src/host.rs | 15 ++++ gateway-types/src/lib.rs | 1 - gateway/src/error.rs | 32 -------- gateway/src/http_entrypoints.rs | 66 +++++++++------ nexus/mgs-updates/tests/host_phase1_hash.rs | 19 ++--- openapi/gateway.json | 90 +++++++-------------- 10 files changed, 93 insertions(+), 204 deletions(-) delete mode 100644 gateway-types/src/error.rs diff --git a/Cargo.lock b/Cargo.lock index 09edfee3e9a..9db98afdd45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3663,7 +3663,6 @@ name = "gateway-types" version = "0.1.0" dependencies = [ "daft", - "dropshot", "gateway-messages", "hex", "ipcc", @@ -3672,7 +3671,6 @@ dependencies = [ "omicron-workspace-hack", "schemars", "serde", - "thiserror 2.0.12", "tufaceous-artifact", "uuid", ] diff --git a/gateway-api/src/lib.rs b/gateway-api/src/lib.rs index 97e01d5be4a..4b9ae62cfce 100644 --- a/gateway-api/src/lib.rs +++ b/gateway-api/src/lib.rs @@ -16,8 +16,7 @@ use gateway_types::{ SpState, }, component_details::SpComponentDetails, - error::ComponentFlashError, - host::HostStartupOptions, + host::{ComponentFirmwareHashStatus, HostStartupOptions}, ignition::{IgnitionCommand, SpIgnitionInfo}, rot::{RotCfpa, RotCfpaSlot, RotCmpa, RotState}, sensor::SpSensorReading, @@ -28,7 +27,7 @@ use gateway_types::{ }, }; use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use uuid::Uuid; /// This endpoint is used to upload SP and ROT Hubris archives as well as phase 1 host OS @@ -273,7 +272,7 @@ pub trait GatewayApi { async fn sp_component_hash_firmware_start( rqctx: RequestContext, path: Path, - ) -> Result; + ) -> Result; /// Get a computed hash of a given slot of a component. /// @@ -288,7 +287,7 @@ pub trait GatewayApi { async fn sp_component_hash_firmware_get( rqctx: RequestContext, path: Path, - ) -> Result, ComponentFlashError>; + ) -> Result, HttpError>; /// Abort any in-progress update an SP component /// @@ -659,8 +658,3 @@ pub struct PathSpIgnitionCommand { /// Ignition command to perform on the targeted SP. pub command: IgnitionCommand, } - -#[derive(Serialize, Deserialize, JsonSchema)] -pub struct ComponentFirmwareHash { - pub sha256: [u8; 32], -} diff --git a/gateway-types/Cargo.toml b/gateway-types/Cargo.toml index ffead9c834e..406f51c36e8 100644 --- a/gateway-types/Cargo.toml +++ b/gateway-types/Cargo.toml @@ -9,7 +9,6 @@ workspace = true [dependencies] daft.workspace = true -dropshot.workspace = true gateway-messages.workspace = true # Avoid depending on gateway-sp-comms! It is a pretty heavy dependency and # would only be used for From impls anyway. We put those impls in @@ -21,6 +20,5 @@ omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true schemars.workspace = true serde.workspace = true -thiserror.workspace = true tufaceous-artifact.workspace = true uuid.workspace = true diff --git a/gateway-types/src/error.rs b/gateway-types/src/error.rs deleted file mode 100644 index e50c16d3dfc..00000000000 --- a/gateway-types/src/error.rs +++ /dev/null @@ -1,56 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use dropshot::ErrorStatusCode; -use dropshot::HttpError; -use dropshot::HttpResponseError; -use schemars::JsonSchema; -use serde::Serialize; - -#[derive(Debug, Serialize, JsonSchema, thiserror::Error)] -#[serde(tag = "state", rename_all = "snake_case")] -pub enum ComponentFlashError { - #[error("hash needs to be calculated")] - HashUncalculated, - #[error("hash needs to be recalculated")] - HashStale, - #[error("hash is currently being calculated; try again shortly")] - HashInProgress, - #[error("{internal_message}")] - Other { - message: String, - error_code: Option, - - // Skip serializing these fields, as they are used for the - // `fmt::Display` implementation and for determining the status - // code, respectively, rather than included in the response body: - #[serde(skip)] - internal_message: String, - #[serde(skip)] - status: ErrorStatusCode, - }, -} - -impl HttpResponseError for ComponentFlashError { - fn status_code(&self) -> ErrorStatusCode { - match self { - Self::HashUncalculated | Self::HashStale => { - ErrorStatusCode::BAD_REQUEST - } - Self::HashInProgress => ErrorStatusCode::SERVICE_UNAVAILABLE, - ComponentFlashError::Other { status, .. } => *status, - } - } -} - -impl From for ComponentFlashError { - fn from(error: HttpError) -> Self { - Self::Other { - message: error.external_message, - internal_message: error.internal_message, - status: error.status_code, - error_code: error.error_code, - } - } -} diff --git a/gateway-types/src/host.rs b/gateway-types/src/host.rs index 73130b0a1aa..c6bf9d30920 100644 --- a/gateway-types/src/host.rs +++ b/gateway-types/src/host.rs @@ -56,3 +56,18 @@ impl From for HostStartupOptions { } } } + +#[derive(Serialize, Deserialize, JsonSchema)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum ComponentFirmwareHashStatus { + /// The hash is not available; the client must issue a separate request to + /// begin calculating the hash. + HashNotCalculated, + /// The hash is currently being calculated; the client should sleep briefly + /// then check again. + /// + /// We expect this operation to take a handful of seconds in practice. + HashInProgress, + /// The hash of the given firmware slot. + Hashed { sha256: [u8; 32] }, +} diff --git a/gateway-types/src/lib.rs b/gateway-types/src/lib.rs index 5c0ded9ada3..149cd1d8f04 100644 --- a/gateway-types/src/lib.rs +++ b/gateway-types/src/lib.rs @@ -7,7 +7,6 @@ pub mod caboose; pub mod component; pub mod component_details; -pub mod error; pub mod host; pub mod ignition; pub mod rot; diff --git a/gateway/src/error.rs b/gateway/src/error.rs index 500391119bf..a78f3599129 100644 --- a/gateway/src/error.rs +++ b/gateway/src/error.rs @@ -6,12 +6,10 @@ use crate::management_switch::SpIdentifier; use dropshot::HttpError; -use gateway_messages::HfError; use gateway_messages::SpError; use gateway_sp_comms::BindError; pub use gateway_sp_comms::error::CommunicationError; use gateway_sp_comms::error::UpdateError; -use gateway_types::error::ComponentFlashError; use slog_error_chain::InlineErrorChain; use slog_error_chain::SlogInlineError; use std::error::Error; @@ -234,33 +232,3 @@ pub(crate) fn http_err_with_message( headers: None, } } - -/// Convert a `CommunicationError` with a given `sp` into a -/// `ComponentFlashError`. -pub(crate) fn map_component_flash_error( - sp: SpIdentifier, - err: CommunicationError, -) -> ComponentFlashError { - if let CommunicationError::SpError(err) = &err { - if let SpError::Hf(err) = err { - match err { - HfError::HashUncalculated => { - return ComponentFlashError::HashUncalculated; - } - HfError::RecalculateHash => { - return ComponentFlashError::HashStale; - } - HfError::HashInProgress => { - return ComponentFlashError::HashInProgress; - } - HfError::NotMuxedToSp - | HfError::BadAddress - | HfError::QspiTimeout - | HfError::QspiTransferError => (), - } - } - } - ComponentFlashError::from(HttpError::from( - SpCommsError::SpCommunicationFailed { sp, err }, - )) -} diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index e8f1223bb25..b5ef3f8e8b2 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -9,7 +9,6 @@ use crate::ServerContext; use crate::error::SpCommsError; use crate::http_err_with_message; -use crate::map_component_flash_error; use base64::Engine; use dropshot::ApiDescription; use dropshot::HttpError; @@ -24,8 +23,10 @@ use dropshot::WebsocketEndpointResult; use dropshot::WebsocketUpgrade; use futures::TryFutureExt; use gateway_api::*; +use gateway_messages::HfError; use gateway_messages::RotBootInfo; use gateway_messages::SpComponent; +use gateway_messages::SpError; use gateway_sp_comms::HostPhase2Provider; use gateway_sp_comms::VersionedSpState; use gateway_sp_comms::error::CommunicationError; @@ -37,7 +38,7 @@ use gateway_types::component::SpComponentList; use gateway_types::component::SpIdentifier; use gateway_types::component::SpState; use gateway_types::component_details::SpComponentDetails; -use gateway_types::error::ComponentFlashError; +use gateway_types::host::ComponentFirmwareHashStatus; use gateway_types::host::HostStartupOptions; use gateway_types::ignition::SpIgnitionInfo; use gateway_types::rot::RotCfpa; @@ -541,28 +542,27 @@ impl GatewayApi for GatewayImpl { async fn sp_component_hash_firmware_start( rqctx: RequestContext, path: Path, - ) -> Result { + ) -> Result { let apictx = rqctx.context(); let PathSpComponentFirmwareSlot { sp, component, firmware_slot } = path.into_inner(); let sp_id = sp.into(); let handler = async { - let sp = apictx.mgmt_switch.sp(sp_id).map_err(HttpError::from)?; + let sp = apictx.mgmt_switch.sp(sp_id)?; let component = component_from_str(&component)?; if component != SpComponent::HOST_CPU_BOOT_FLASH { - return Err(ComponentFlashError::from( - HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the host boot flash can be hashed".into(), - ), + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the host boot flash can be hashed".into(), )); } - sp.start_host_flash_hash(firmware_slot) - .await - .map_err(|err| map_component_flash_error(sp_id, err))?; + // TODO-john catch hash in progress? + sp.start_host_flash_hash(firmware_slot).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; Ok(HttpResponseUpdatedNoContent()) }; @@ -572,32 +572,48 @@ impl GatewayApi for GatewayImpl { async fn sp_component_hash_firmware_get( rqctx: RequestContext, path: Path, - ) -> Result, ComponentFlashError> - { + ) -> Result, HttpError> { let apictx = rqctx.context(); let PathSpComponentFirmwareSlot { sp, component, firmware_slot } = path.into_inner(); let sp_id = sp.into(); let handler = async { - let sp = apictx.mgmt_switch.sp(sp_id).map_err(HttpError::from)?; + let sp = apictx.mgmt_switch.sp(sp_id)?; let component = component_from_str(&component)?; if component != SpComponent::HOST_CPU_BOOT_FLASH { - return Err(ComponentFlashError::from( - HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the host boot flash can be hashed".into(), - ), + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the host boot flash can be hashed".into(), )); } - let sha256 = sp - .get_host_flash_hash(firmware_slot) - .await - .map_err(|err| map_component_flash_error(sp_id, err))?; + let status = match sp.get_host_flash_hash(firmware_slot).await { + // success + Ok(sha256) => ComponentFirmwareHashStatus::Hashed { sha256 }, + + // expected failure: hash needs to be calculated (or + // recalculated; either way the client operation is the same) + Err(CommunicationError::SpError(SpError::Hf( + HfError::HashUncalculated | HfError::RecalculateHash, + ))) => ComponentFirmwareHashStatus::HashNotCalculated, + + // expected failure: hashing is currently in progress; client + // needs to wait and try again later + Err(CommunicationError::SpError(SpError::Hf( + HfError::HashInProgress, + ))) => ComponentFirmwareHashStatus::HashInProgress, + + // other errors are failures + Err(err) => { + return Err(HttpError::from( + SpCommsError::SpCommunicationFailed { sp: sp_id, err }, + )); + } + }; - Ok(HttpResponseOk(ComponentFirmwareHash { sha256 })) + Ok(HttpResponseOk(status)) }; apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } diff --git a/nexus/mgs-updates/tests/host_phase1_hash.rs b/nexus/mgs-updates/tests/host_phase1_hash.rs index 348828557b3..76a3329da85 100644 --- a/nexus/mgs-updates/tests/host_phase1_hash.rs +++ b/nexus/mgs-updates/tests/host_phase1_hash.rs @@ -8,13 +8,11 @@ //! ask it to start hashing, then we'll get "still hashing" errors for a few //! seconds, then we'll get the hash result. -use gateway_client::Error; use gateway_client::SpComponent; -use gateway_client::types::ComponentFlashError; +use gateway_client::types::ComponentFirmwareHashStatus; use gateway_client::types::SpType; use gateway_messages::SpPort; use gateway_test_utils::setup as mgs_setup; -use slog_error_chain::InlineErrorChain; #[tokio::test] async fn test_host_phase1_hashing() { @@ -34,7 +32,7 @@ async fn test_host_phase1_hashing() { // We haven't yet started hashing; we should get the error we expect for // both slots. for firmware_slot in [0, 1] { - match mgs_client + let status = mgs_client .sp_component_hash_firmware_get( sp_type, sp_slot, @@ -42,15 +40,10 @@ async fn test_host_phase1_hashing() { firmware_slot, ) .await - { - Err(Error::ErrorResponse(err)) => match err.into_inner() { - ComponentFlashError::HashUncalculated => (), - other => panic!("unexpected error response: {other:?}"), - }, - Err(err) => { - panic!("unexpected error: {}", InlineErrorChain::new(&err)) - } - Ok(resp) => panic!("unexpected success: {resp:?}"), + .expect("got firmware hash status"); + match status.into_inner() { + ComponentFirmwareHashStatus::HashNotCalculated => (), + other => panic!("unexpected status: {other:?}"), } } } diff --git a/openapi/gateway.json b/openapi/gateway.json index ed5db888cd5..c6fd494c0f1 100644 --- a/openapi/gateway.json +++ b/openapi/gateway.json @@ -767,16 +767,16 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ComponentFirmwareHash" + "$ref": "#/components/schemas/ComponentFirmwareHashStatus" } } } }, "4XX": { - "$ref": "#/components/responses/ComponentFlashError" + "$ref": "#/components/responses/Error" }, "5XX": { - "$ref": "#/components/responses/ComponentFlashError" + "$ref": "#/components/responses/Error" } } }, @@ -829,10 +829,10 @@ "description": "resource updated" }, "4XX": { - "$ref": "#/components/responses/ComponentFlashError" + "$ref": "#/components/responses/Error" }, "5XX": { - "$ref": "#/components/responses/ComponentFlashError" + "$ref": "#/components/responses/Error" } } } @@ -1818,58 +1818,28 @@ }, "components": { "schemas": { - "ComponentFirmwareHash": { - "type": "object", - "properties": { - "sha256": { - "type": "array", - "items": { - "type": "integer", - "format": "uint8", - "minimum": 0 - }, - "minItems": 32, - "maxItems": 32 - } - }, - "required": [ - "sha256" - ] - }, - "ComponentFlashError": { + "ComponentFirmwareHashStatus": { "oneOf": [ { + "description": "The hash is not available; the client must issue a separate request to begin calculating the hash.", "type": "object", "properties": { - "state": { - "type": "string", - "enum": [ - "hash_uncalculated" - ] - } - }, - "required": [ - "state" - ] - }, - { - "type": "object", - "properties": { - "state": { + "status": { "type": "string", "enum": [ - "hash_stale" + "hash_not_calculated" ] } }, "required": [ - "state" + "status" ] }, { + "description": "The hash is currently being calculated; the client should sleep briefly then check again.\n\nWe expect this operation to take a handful of seconds in practice.", "type": "object", "properties": { - "state": { + "status": { "type": "string", "enum": [ "hash_in_progress" @@ -1877,29 +1847,33 @@ } }, "required": [ - "state" + "status" ] }, { + "description": "The hash of the given firmware slot.", "type": "object", "properties": { - "error_code": { - "nullable": true, - "type": "string" - }, - "message": { - "type": "string" + "sha256": { + "type": "array", + "items": { + "type": "integer", + "format": "uint8", + "minimum": 0 + }, + "minItems": 32, + "maxItems": 32 }, - "state": { + "status": { "type": "string", "enum": [ - "other" + "hashed" ] } }, "required": [ - "message", - "state" + "sha256", + "status" ] } ] @@ -3982,16 +3956,6 @@ } } } - }, - "ComponentFlashError": { - "description": "ComponentFlashError", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ComponentFlashError" - } - } - } } } } From 1075fe9b739c319e2e143c132b39514f04872087 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 14 Jul 2025 16:38:33 -0400 Subject: [PATCH 4/7] flesh out test_host_phase1_hashing() --- Cargo.lock | 1 + clients/gateway-client/src/lib.rs | 1 + gateway/src/http_entrypoints.rs | 20 +- nexus/mgs-updates/Cargo.toml | 1 + nexus/mgs-updates/tests/host_phase1_hash.rs | 204 ++++++++++++++++++++ sp-sim/src/gimlet.rs | 20 ++ sp-sim/src/lib.rs | 1 + sp-sim/src/update.rs | 52 ++++- 8 files changed, 292 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9db98afdd45..d8f2ad0fc80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6509,6 +6509,7 @@ dependencies = [ "internal-dns-types", "nexus-types", "omicron-common", + "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", "qorb", diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index c4b16790362..8219f7d34b2 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -60,6 +60,7 @@ progenitor::generate_api!( }), derives = [schemars::JsonSchema], patch = { + ComponentFirmwareHashStatus = { derives = [PartialEq, Eq, PartialOrd, Ord] }, HostPhase2RecoveryImageId = { derives = [PartialEq, Eq, PartialOrd, Ord] }, ImageVersion = { derives = [PartialEq, Eq, PartialOrd, Ord] }, RotImageDetails = { derives = [PartialEq, Eq, PartialOrd, Ord] }, diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index b5ef3f8e8b2..d45b738fba7 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -559,12 +559,20 @@ impl GatewayApi for GatewayImpl { )); } - // TODO-john catch hash in progress? - sp.start_host_flash_hash(firmware_slot).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - - Ok(HttpResponseUpdatedNoContent()) + // The SP (reasonably!) returns a `HashInProgress` error if we try + // to start hashing while hashing is being calculated, but we're + // presenting an idempotent "start hashing if it isn't started" + // endpoint instead. Swallow that error. + match sp.start_host_flash_hash(firmware_slot).await { + Ok(()) + | Err(CommunicationError::SpError(SpError::Hf( + HfError::HashInProgress, + ))) => Ok(HttpResponseUpdatedNoContent()), + Err(err) => { + Err(SpCommsError::SpCommunicationFailed { sp: sp_id, err } + .into()) + } + } }; apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } diff --git a/nexus/mgs-updates/Cargo.toml b/nexus/mgs-updates/Cargo.toml index 9c47c14cf47..df65ddd07f7 100644 --- a/nexus/mgs-updates/Cargo.toml +++ b/nexus/mgs-updates/Cargo.toml @@ -41,6 +41,7 @@ dropshot.workspace = true gateway-messages.workspace = true gateway-test-utils.workspace = true hubtools.workspace = true +omicron-test-utils.workspace = true rand.workspace = true repo-depot-api.workspace = true sp-sim.workspace = true diff --git a/nexus/mgs-updates/tests/host_phase1_hash.rs b/nexus/mgs-updates/tests/host_phase1_hash.rs index 76a3329da85..87c5077ae31 100644 --- a/nexus/mgs-updates/tests/host_phase1_hash.rs +++ b/nexus/mgs-updates/tests/host_phase1_hash.rs @@ -8,12 +8,55 @@ //! ask it to start hashing, then we'll get "still hashing" errors for a few //! seconds, then we'll get the hash result. +use gateway_client::Client; use gateway_client::SpComponent; use gateway_client::types::ComponentFirmwareHashStatus; use gateway_client::types::SpType; +use gateway_client::types::SpUpdateStatus; use gateway_messages::SpPort; use gateway_test_utils::setup as mgs_setup; +use omicron_test_utils::dev::poll::CondCheckError; +use omicron_test_utils::dev::poll::wait_for_condition; +use sha2::Digest as _; +use sha2::Sha256; +use sp_sim::SimulatedSp; +use std::time::Duration; +use uuid::Uuid; +struct Phase1HashStatusChecker<'a> { + mgs_client: &'a Client, + sp_type: SpType, + sp_slot: u16, + sp_component: &'a str, +} + +impl Phase1HashStatusChecker<'_> { + async fn assert_status( + &self, + expected: &[(u16, ComponentFirmwareHashStatus)], + ) { + for (firmware_slot, expected_status) in expected { + let status = self + .mgs_client + .sp_component_hash_firmware_get( + self.sp_type, + self.sp_slot, + self.sp_component, + *firmware_slot, + ) + .await + .expect("got firmware hash status"); + assert_eq!( + status.into_inner(), + *expected_status, + "unexpected status for slot {firmware_slot}" + ); + } + } +} + +// This is primarily a test of the `sp-sim` implementation of host phase 1 +// flashing, with a minor side test that MGS's endpoints wrap it faithfully. #[tokio::test] async fn test_host_phase1_hashing() { // Start MGS + Sim SP. @@ -25,9 +68,16 @@ async fn test_host_phase1_hashing() { // We'll only talk to one sp-sim for this test. let mgs_client = mgstestctx.client(); + let sp_sim = &mgstestctx.simrack.gimlets[0]; let sp_type = SpType::Sled; let sp_component = SpComponent::HOST_CPU_BOOT_FLASH.const_as_str(); let sp_slot = 0; + let phase1_checker = Phase1HashStatusChecker { + mgs_client: &mgs_client, + sp_type, + sp_slot, + sp_component, + }; // We haven't yet started hashing; we should get the error we expect for // both slots. @@ -46,4 +96,158 @@ async fn test_host_phase1_hashing() { other => panic!("unexpected status: {other:?}"), } } + + // We want explicit (i.e., not-timer-based) control over when hashing + // completes. + let hashing_complete_sender = + sp_sim.set_phase1_hash_policy_explicit_control().await; + + // Start hashing firmware slot 0. + mgs_client + .sp_component_hash_firmware_start(sp_type, sp_slot, sp_component, 0) + .await + .expect("started firmware hashing"); + + // We should see the expected status; hash is computing in slot 0 and not + // yet started in slot 1. + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::HashInProgress), + (1, ComponentFirmwareHashStatus::HashNotCalculated), + ]) + .await; + + // We can start hashing firmware slot 0 again; this should be a no-op while + // hashing is being done. + mgs_client + .sp_component_hash_firmware_start(sp_type, sp_slot, sp_component, 0) + .await + .expect("starting hashing while hashing should be okay"); + + // Calculate the hash we expect to see. + let expected_sha256_0 = Sha256::digest( + sp_sim.last_host_phase1_update_data(0).await.as_deref().unwrap_or(&[]), + ) + .into(); + + // Allow the next `get()` to succeed. + hashing_complete_sender.complete_next_hashing_attempt(); + + // We should see the expected status; hash is complete in slot 0 and not + // yet started in slot 1. + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::HashNotCalculated), + ]) + .await; + + // Repeat, but slot 1. + mgs_client + .sp_component_hash_firmware_start(sp_type, sp_slot, sp_component, 1) + .await + .expect("started firmware hashing"); + hashing_complete_sender.complete_next_hashing_attempt(); + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + ]) + .await; + + // Upload a new, fake phase1 to slot 1. + let fake_phase1 = b"test_host_phase1_hashing_fake_data".as_slice(); + let expected_sha256_1 = Sha256::digest(fake_phase1).into(); + + // Drive the update to completion. + { + let update_id = Uuid::new_v4(); + mgs_client + .sp_component_update( + sp_type, + sp_slot, + sp_component, + 1, + &update_id, + fake_phase1, + ) + .await + .expect("started slot 1 update"); + wait_for_condition( + || async { + let update_status = mgs_client + .sp_component_update_status(sp_type, sp_slot, sp_component) + .await + .expect("got update status") + .into_inner(); + match update_status { + // expected terminal state + SpUpdateStatus::Complete { id } => { + if id == update_id { + Ok(()) + } else { + Err(CondCheckError::Failed(format!( + "unexpected complete ID \ + (got {id} expected {update_id})" + ))) + } + } + + // expected intermediate states + SpUpdateStatus::Preparing { .. } + | SpUpdateStatus::InProgress { .. } => { + Err(CondCheckError::NotYet) + } + + // never-expect-to-see states + SpUpdateStatus::None + | SpUpdateStatus::Aborted { .. } + | SpUpdateStatus::Failed { .. } + | SpUpdateStatus::RotError { .. } => { + Err(CondCheckError::Failed(format!( + "unexpected status: {update_status:?}" + ))) + } + } + }, + &Duration::from_millis(100), + &Duration::from_secs(30), + ) + .await + .expect("update to sp-sim completed within timeout"); + } + + // Confirm the simulator wrote the expected data in slot 1. + let slot_1_data = sp_sim.last_host_phase1_update_data(1).await.unwrap(); + assert_eq!(*slot_1_data, *fake_phase1); + + // Writing an update should have put slot 1 back into the "needs hashing" + // state. + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::HashNotCalculated), + ]) + .await; + + // Start hashing firmware slot 1. + mgs_client + .sp_component_hash_firmware_start(sp_type, sp_slot, sp_component, 1) + .await + .expect("started firmware hashing"); + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::HashInProgress), + ]) + .await; + + // Allow hashing to complete. + hashing_complete_sender.complete_next_hashing_attempt(); + phase1_checker + .assert_status(&[ + (0, ComponentFirmwareHashStatus::Hashed(expected_sha256_0)), + (1, ComponentFirmwareHashStatus::Hashed(expected_sha256_1)), + ]) + .await; } diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs index 8d21fc4cb67..4dc2e19d20d 100644 --- a/sp-sim/src/gimlet.rs +++ b/sp-sim/src/gimlet.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use crate::HostFlashHashCompletionSender; use crate::Responsiveness; use crate::SimulatedSp; use crate::config::GimletConfig; @@ -404,6 +405,25 @@ impl Gimlet { pub fn last_request_handled(&self) -> Option { *self.last_request_handled.lock().unwrap() } + + /// Instead of host phase 1 hashing completing after a few seconds, return a + /// handle that can be used to explicitly trigger completion. + /// + /// # Panics + /// + /// Panics if this `Gimlet` was created with only an RoT instead of a full + /// SP + RoT complex. + pub async fn set_phase1_hash_policy_explicit_control( + &self, + ) -> HostFlashHashCompletionSender { + self.handler + .as_ref() + .expect("gimlet was created with SP config") + .lock() + .await + .update_state + .set_phase1_hash_policy_explicit_control() + } } struct SerialConsoleTcpTask { diff --git a/sp-sim/src/lib.rs b/sp-sim/src/lib.rs index d33444164ec..d439f9477ff 100644 --- a/sp-sim/src/lib.rs +++ b/sp-sim/src/lib.rs @@ -26,6 +26,7 @@ pub use slog::Logger; use std::net::SocketAddrV6; use tokio::sync::mpsc; use tokio::sync::watch; +pub use update::HostFlashHashCompletionSender; pub const SIM_ROT_BOARD: &str = "SimRot"; pub const SIM_ROT_STAGE0_BOARD: &str = "SimRotStage0"; diff --git a/sp-sim/src/update.rs b/sp-sim/src/update.rs index de13a2c2106..032f1491c5d 100644 --- a/sp-sim/src/update.rs +++ b/sp-sim/src/update.rs @@ -27,6 +27,7 @@ use hubtools::RawHubrisImage; use sha2::Sha256; use sha3::Digest; use sha3::Sha3_256; +use tokio::sync::mpsc; // How long do we take to hash host flash? Real SPs take a handful of seconds; // we'll pick something similar. @@ -48,6 +49,11 @@ pub(crate) struct SimSpUpdate { last_host_phase1_update_data: BTreeMap>, /// state of hashing each of the host phase1 slots phase1_hash_state: BTreeMap, + /// how do we decide when we're done hashing host phase1 slots? this allows + /// us to default to `TIME_TO_HASH_HOST_PHASE_1` (e.g., for running sp-sim + /// as a part of `omicron-dev`) while giving tests that want explicit + /// control the ability to precisely trigger completion of hashing. + phase1_hash_policy: HostFlashHashPolicy, /// records whether a change to the stage0 "active slot" has been requested pending_stage0_update: bool, @@ -188,6 +194,9 @@ impl SimSpUpdate { last_rot_update_data: None, last_host_phase1_update_data: BTreeMap::new(), phase1_hash_state: BTreeMap::new(), + phase1_hash_policy: HostFlashHashPolicy::Timer( + TIME_TO_HASH_HOST_PHASE_1, + ), pending_stage0_update: false, @@ -202,6 +211,16 @@ impl SimSpUpdate { } } + /// Instead of host phase 1 hashing completing after a few seconds, return a + /// handle that can be used to explicitly trigger completion. + pub(crate) fn set_phase1_hash_policy_explicit_control( + &mut self, + ) -> HostFlashHashCompletionSender { + let (tx, rx) = mpsc::unbounded_channel(); + self.phase1_hash_policy = HostFlashHashPolicy::Channel(rx); + HostFlashHashCompletionSender(tx) + } + pub(crate) fn sp_update_prepare( &mut self, id: UpdateId, @@ -530,8 +549,17 @@ impl SimSpUpdate { slot: u16, started: Instant, ) -> Result<[u8; 32], SpError> { - if started.elapsed() < TIME_TO_HASH_HOST_PHASE_1 { - return Err(SpError::Hf(HfError::HashInProgress)); + match &mut self.phase1_hash_policy { + HostFlashHashPolicy::Timer(duration) => { + if started.elapsed() < *duration { + return Err(SpError::Hf(HfError::HashInProgress)); + } + } + HostFlashHashPolicy::Channel(rx) => { + if rx.try_recv().is_err() { + return Err(SpError::Hf(HfError::HashInProgress)); + } + } } let data = self.last_host_phase1_update_data(slot); @@ -761,3 +789,23 @@ enum HostFlashHashState { Hashed([u8; 32]), HashInvalidated, } + +#[derive(Debug)] +enum HostFlashHashPolicy { + /// complete hashing after `Duration` has elapsed + Timer(Duration), + /// complete hashing if there's a message in this channel + Channel(mpsc::UnboundedReceiver<()>), +} + +pub struct HostFlashHashCompletionSender(mpsc::UnboundedSender<()>); + +impl HostFlashHashCompletionSender { + /// Allow the next request to get the hash result to succeed. + /// + /// Multiple calls to this function will queue multiple hash result + /// successes. + pub fn complete_next_hashing_attempt(&self) { + self.0.send(()).expect("receiving sp-sim instance is gone"); + } +} From dee6aedaf11dc56ba7a18e13dc65c77ab414dd51 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 14 Jul 2025 16:51:32 -0400 Subject: [PATCH 5/7] test cleanup --- nexus/mgs-updates/tests/host_phase1_hash.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nexus/mgs-updates/tests/host_phase1_hash.rs b/nexus/mgs-updates/tests/host_phase1_hash.rs index 87c5077ae31..0723c9f2e2d 100644 --- a/nexus/mgs-updates/tests/host_phase1_hash.rs +++ b/nexus/mgs-updates/tests/host_phase1_hash.rs @@ -250,4 +250,6 @@ async fn test_host_phase1_hashing() { (1, ComponentFirmwareHashStatus::Hashed(expected_sha256_1)), ]) .await; + + mgstestctx.teardown().await; } From a194d78009d9dd0488d78dcd83f5b5144bf42ec3 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 16 Jul 2025 15:52:43 -0400 Subject: [PATCH 6/7] expand API comments --- gateway-api/src/lib.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/gateway-api/src/lib.rs b/gateway-api/src/lib.rs index 4b9ae62cfce..18d068cfb7f 100644 --- a/gateway-api/src/lib.rs +++ b/gateway-api/src/lib.rs @@ -263,8 +263,10 @@ pub trait GatewayApi { /// /// This endpoint is only valid for the `host-boot-flash` component. /// - /// Computing the hash takes several seconds; callers can poll for results - /// using `sp_component_hash_firmware_get()`. + /// Computing the hash takes several seconds; callers should poll for results + /// using `sp_component_hash_firmware_get()`. In general they should call + /// `sp_component_hash_firmware_get()` first anyway, as the hashes are + /// cached in the SP and may already be ready. #[endpoint { method = POST, path = "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}", @@ -278,8 +280,11 @@ pub trait GatewayApi { /// /// This endpoint is only valid for the `host-boot-flash` component. /// - /// Computing the hash takes several seconds; callers can start hashing - /// using `sp_component_hash_firmware_start()`. + /// Computing the hash takes several seconds; this endpoint returns the + /// current status. If the status is `HashNotStarted`, callers should start + /// hashing using `sp_component_hash_firmware_start()`. If the status is + /// `HashInProgress`, callers should wait a bit then call this endpoint + /// again. #[endpoint { method = GET, path = "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}", From 1d89597581ed4d255e739d642a1a10a23c628e34 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 17 Jul 2025 10:58:46 -0400 Subject: [PATCH 7/7] openapi comments --- openapi/gateway.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openapi/gateway.json b/openapi/gateway.json index c6fd494c0f1..23b216037bd 100644 --- a/openapi/gateway.json +++ b/openapi/gateway.json @@ -719,7 +719,7 @@ "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}": { "get": { "summary": "Get a computed hash of a given slot of a component.", - "description": "This endpoint is only valid for the `host-boot-flash` component.\n\nComputing the hash takes several seconds; callers can start hashing using `sp_component_hash_firmware_start()`.", + "description": "This endpoint is only valid for the `host-boot-flash` component.\n\nComputing the hash takes several seconds; this endpoint returns the current status. If the status is `HashNotStarted`, callers should start hashing using `sp_component_hash_firmware_start()`. If the status is `HashInProgress`, callers should wait a bit then call this endpoint again.", "operationId": "sp_component_hash_firmware_get", "parameters": [ { @@ -782,7 +782,7 @@ }, "post": { "summary": "Start computing the hash of a given slot of a component.", - "description": "This endpoint is only valid for the `host-boot-flash` component.\n\nComputing the hash takes several seconds; callers can poll for results using `sp_component_hash_firmware_get()`.", + "description": "This endpoint is only valid for the `host-boot-flash` component.\n\nComputing the hash takes several seconds; callers should poll for results using `sp_component_hash_firmware_get()`. In general they should call `sp_component_hash_firmware_get()` first anyway, as the hashes are cached in the SP and may already be ready.", "operationId": "sp_component_hash_firmware_start", "parameters": [ {