Skip to content

[MGS] Add endpoints for host phase 1 flash hashing #8593

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions clients/gateway-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ progenitor::generate_api!(
}),
derives = [schemars::JsonSchema],
patch = {
ComponentFirmwareHashStatus = { derives = [PartialEq, Eq, PartialOrd, Ord] },
HostPhase2RecoveryImageId = { derives = [PartialEq, Eq, PartialOrd, Ord] },
ImageVersion = { derives = [PartialEq, Eq, PartialOrd, Ord] },
RotImageDetails = { derives = [PartialEq, Eq, PartialOrd, Ord] },
Expand Down
50 changes: 49 additions & 1 deletion gateway-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use gateway_types::{
SpState,
},
component_details::SpComponentDetails,
host::HostStartupOptions,
host::{ComponentFirmwareHashStatus, HostStartupOptions},
ignition::{IgnitionCommand, SpIgnitionInfo},
rot::{RotCfpa, RotCfpaSlot, RotCmpa, RotState},
sensor::SpSensorReading,
Expand Down Expand Up @@ -259,6 +259,41 @@ pub trait GatewayApi {
path: Path<PathSpComponent>,
) -> Result<HttpResponseOk<SpUpdateStatus>, HttpError>;

/// Start computing the hash of a given slot of a component.
///
/// This endpoint is only valid for the `host-boot-flash` component.
///
/// Computing the hash takes several seconds; callers should poll for results
/// using `sp_component_hash_firmware_get()`. In general they should call
/// `sp_component_hash_firmware_get()` first anyway, as the hashes are
/// cached in the SP and may already be ready.
#[endpoint {
method = POST,
path = "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}",
}]
async fn sp_component_hash_firmware_start(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpComponentFirmwareSlot>,
) -> Result<HttpResponseUpdatedNoContent, HttpError>;

/// Get a computed hash of a given slot of a component.
///
/// This endpoint is only valid for the `host-boot-flash` component.
///
/// Computing the hash takes several seconds; this endpoint returns the
/// current status. If the status is `HashNotStarted`, callers should start
/// hashing using `sp_component_hash_firmware_start()`. If the status is
/// `HashInProgress`, callers should wait a bit then call this endpoint
/// again.
#[endpoint {
method = GET,
path = "/sp/{type}/{slot}/component/{component}/hash/{firmware_slot}",
}]
async fn sp_component_hash_firmware_get(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpComponentFirmwareSlot>,
) -> Result<HttpResponseOk<ComponentFirmwareHashStatus>, HttpError>;

/// Abort any in-progress update an SP component
///
/// Aborting an update to the SP itself is done via the component name
Expand Down Expand Up @@ -542,6 +577,19 @@ pub struct PathSpComponent {
pub component: String,
}

#[derive(Deserialize, JsonSchema)]
pub struct PathSpComponentFirmwareSlot {
/// ID for the SP that the gateway service translates into the appropriate
/// port for communicating with the given SP.
#[serde(flatten)]
pub sp: SpIdentifier,
/// ID for the component of the SP; this is the internal identifier used by
/// the SP itself to identify its components.
pub component: String,
/// Firmware slot of the component.
pub firmware_slot: u16,
}

#[derive(Deserialize, JsonSchema)]
pub struct PathSpTaskDumpIndex {
/// ID for the SP that the gateway service translates into the appropriate
Expand Down
15 changes: 15 additions & 0 deletions gateway-types/src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,18 @@ impl From<StartupOptions> for HostStartupOptions {
}
}
}

#[derive(Serialize, Deserialize, JsonSchema)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum ComponentFirmwareHashStatus {
/// The hash is not available; the client must issue a separate request to
/// begin calculating the hash.
HashNotCalculated,
/// The hash is currently being calculated; the client should sleep briefly
/// then check again.
///
/// We expect this operation to take a handful of seconds in practice.
HashInProgress,
/// The hash of the given firmware slot.
Hashed { sha256: [u8; 32] },
}
90 changes: 90 additions & 0 deletions gateway/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ use dropshot::WebsocketEndpointResult;
use dropshot::WebsocketUpgrade;
use futures::TryFutureExt;
use gateway_api::*;
use gateway_messages::HfError;
use gateway_messages::RotBootInfo;
use gateway_messages::SpComponent;
use gateway_messages::SpError;
use gateway_sp_comms::HostPhase2Provider;
use gateway_sp_comms::VersionedSpState;
use gateway_sp_comms::error::CommunicationError;
Expand All @@ -36,6 +38,7 @@ use gateway_types::component::SpComponentList;
use gateway_types::component::SpIdentifier;
use gateway_types::component::SpState;
use gateway_types::component_details::SpComponentDetails;
use gateway_types::host::ComponentFirmwareHashStatus;
use gateway_types::host::HostStartupOptions;
use gateway_types::ignition::SpIgnitionInfo;
use gateway_types::rot::RotCfpa;
Expand Down Expand Up @@ -536,6 +539,93 @@ impl GatewayApi for GatewayImpl {
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

async fn sp_component_hash_firmware_start(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpComponentFirmwareSlot>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
let apictx = rqctx.context();

let PathSpComponentFirmwareSlot { sp, component, firmware_slot } =
path.into_inner();
let sp_id = sp.into();
let handler = async {
let sp = apictx.mgmt_switch.sp(sp_id)?;
let component = component_from_str(&component)?;

if component != SpComponent::HOST_CPU_BOOT_FLASH {
return Err(HttpError::for_bad_request(
Some("RequestUnsupportedForComponent".to_string()),
"Only the host boot flash can be hashed".into(),
));
}

// The SP (reasonably!) returns a `HashInProgress` error if we try
// to start hashing while hashing is being calculated, but we're
// presenting an idempotent "start hashing if it isn't started"
// endpoint instead. Swallow that error.
Comment on lines +562 to +565
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible for the hashing to fail after it has reported it's in progress? If that happens we might still get a HttpResponseUpdatedNoContent even though the hashing failed no? What would the consequences of this be?

if sp.start_host_flash_hash(firmware_slot) returns HfError::HashInProgress, would it make sense to call sp.get_host_flash_hash(firmware_slot) on a loop with a timeout until we get an Ok(), or another error?

Copy link
Contributor Author

@jgallagher jgallagher Jul 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible for the hashing to fail after it has reported it's in progress?

Yeah, definitely, but it would have a different error code.

If that happens we might still get a HttpResponseUpdatedNoContent even though the hashing failed no?

No, I don't think so - the only error we turn into HttpResponseUpdatedNoContent is HfError::HashInProgress; any other error turns into an SpCommunicationFailed in the second arm of the match.

If the SP gets stuck and returns HashInProgress indefinitely, we'd keep returning HttpResponseUpdatedNoContent from this method, but presumably a client will be polling the get endpoint with a timeout. Which gets to your second point!

if sp.start_host_flash_hash(firmware_slot) returns HfError::HashInProgress, would it make sense to call sp.get_host_flash_hash(firmware_slot) on a loop with a timeout until we get an Ok(), or another error?

Maybe - I definitely considered this! At some level someone has to do exactly that, and it's a question of who:

a) Put it in MGS - tempting, but now MGS has to have (or accept from its client) a timeout for that loop.
b) Expose these two endpoints as-is from MGS and make Nexus do the looping + timeout

I think I slightly prefer b, just because of a bias to have MGS do as little as possible in principle? Nexus already has to deal with looping and timeouts for all kinds of update-related things, so adding one more seems better than putting one in MGS.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but presumably a client will be polling the get endpoint with a timeout. Which gets to your second point!

Ah! ok, that's the bit of information I was missing here. Can we add a comment explaining this?

I think I slightly prefer b, just because of a bias to have MGS do as little as possible in principle? Nexus already has to deal with looping and timeouts for all kinds of update-related things, so adding one more seems better than putting one in MGS.

Yeah, that makes sense to me as well

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure - I expanded the comments in a194d78

match sp.start_host_flash_hash(firmware_slot).await {
Ok(())
| Err(CommunicationError::SpError(SpError::Hf(
HfError::HashInProgress,
))) => Ok(HttpResponseUpdatedNoContent()),
Err(err) => {
Err(SpCommsError::SpCommunicationFailed { sp: sp_id, err }
.into())
}
}
};
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

async fn sp_component_hash_firmware_get(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpComponentFirmwareSlot>,
) -> Result<HttpResponseOk<ComponentFirmwareHashStatus>, HttpError> {
let apictx = rqctx.context();

let PathSpComponentFirmwareSlot { sp, component, firmware_slot } =
path.into_inner();
let sp_id = sp.into();
let handler = async {
let sp = apictx.mgmt_switch.sp(sp_id)?;
let component = component_from_str(&component)?;

if component != SpComponent::HOST_CPU_BOOT_FLASH {
return Err(HttpError::for_bad_request(
Some("RequestUnsupportedForComponent".to_string()),
"Only the host boot flash can be hashed".into(),
));
}

let status = match sp.get_host_flash_hash(firmware_slot).await {
// success
Ok(sha256) => ComponentFirmwareHashStatus::Hashed { sha256 },

// expected failure: hash needs to be calculated (or
// recalculated; either way the client operation is the same)
Err(CommunicationError::SpError(SpError::Hf(
HfError::HashUncalculated | HfError::RecalculateHash,
))) => ComponentFirmwareHashStatus::HashNotCalculated,

// expected failure: hashing is currently in progress; client
// needs to wait and try again later
Err(CommunicationError::SpError(SpError::Hf(
HfError::HashInProgress,
))) => ComponentFirmwareHashStatus::HashInProgress,

// other errors are failures
Err(err) => {
return Err(HttpError::from(
SpCommsError::SpCommunicationFailed { sp: sp_id, err },
));
}
};

Ok(HttpResponseOk(status))
};
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

async fn sp_component_update_abort(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpComponent>,
Expand Down
1 change: 1 addition & 0 deletions nexus/mgs-updates/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ dropshot.workspace = true
gateway-messages.workspace = true
gateway-test-utils.workspace = true
hubtools.workspace = true
omicron-test-utils.workspace = true
rand.workspace = true
repo-depot-api.workspace = true
sp-sim.workspace = true
Loading
Loading