
Commit c09902e

Add host phase 1 flash hashes to inventory (#8624)
Reconfigurator needs to know the current contents of the host phase 1 flash slots to drive OS upgrades; this change adds hashes of those contents to inventory. It uses the new MGS endpoints landed in #8593.
1 parent d75ea8d commit c09902e

File tree

30 files changed: 832 additions (+), 18 deletions (−)


Cargo.lock

Lines changed: 4 additions & 0 deletions
(Generated file; diff not rendered.)

clients/gateway-client/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -23,5 +23,6 @@ serde_json.workspace = true
 schemars.workspace = true
 slog.workspace = true
 thiserror.workspace = true
+tokio.workspace = true
 uuid.workspace = true
 omicron-workspace-hack.workspace = true

clients/gateway-client/src/lib.rs

Lines changed: 131 additions & 0 deletions
@@ -7,6 +7,9 @@
 //! Interface for API requests to a Management Gateway Service (MGS) instance
 
 pub use gateway_messages::SpComponent;
+use std::time::Duration;
+use std::time::Instant;
+use types::ComponentFirmwareHashStatus;
 
 // We specifically want to allow consumers, such as `wicketd`, to embed
 // inventory datatypes into their own APIs, rather than recreate structs.
@@ -97,3 +100,131 @@ impl PartialOrd for crate::types::SpIdentifier {
         Some(self.cmp(other))
     }
 }
+
+#[derive(Debug, thiserror::Error)]
+pub enum HostPhase1HashError {
+    #[error("timed out waiting for hash calculation")]
+    Timeout,
+    #[error("hash calculation failed (phase1 written while hashing?)")]
+    ContentsModifiedWhileHashing,
+    #[error("failed to send request to {kind}")]
+    RequestError {
+        kind: &'static str,
+        #[source]
+        err: Error<types::Error>,
+    },
+}
+
+impl Client {
+    /// Get the hash of the host phase 1 flash contents in the given slot.
+    ///
+    /// This operation is implemented asynchronously on the SP: a client (us)
+    /// must request the hash be calculated, then poll until the calculation is
+    /// complete. This method takes care of the "start / poll" operation; the
+    /// caller must provide a timeout for how long they're willing to wait for
+    /// the calculation to complete. In practice, we expect this to take a
+    /// handful of seconds on real hardware.
+    pub async fn host_phase_1_flash_hash_calculate_with_timeout(
+        &self,
+        sp: types::SpIdentifier,
+        phase1_slot: u16,
+        timeout: Duration,
+    ) -> Result<[u8; 32], HostPhase1HashError> {
+        // The most common cases of calling this function are:
+        //
+        // 1. The hash is already calculated; we get it in the first `get`
+        //    operation below and return after a single request to MGS.
+        // 2. The hash needs to be recalculated; we'll issue a "start hashing"
+        //    request then go into the polling loop. We expect to sit in that
+        //    loop for a handful of seconds.
+        //
+        // Given these, we could make this poll duration longer, since we know
+        // the operation takes a little while. But there are two arguments for
+        // polling somewhat more frequently:
+        //
+        // 1. Timeouts, timeouts, always wrong; if we believe hashing takes (by
+        //    way of example) 7 seconds, so we set the timeout to something
+        //    slightly larger than that (say 10 seconds), if a real device takes
+        //    slightly longer than our timeout, we now wait 20 seconds.
+        // 2. An uncommon case of calling this function is that our initial
+        //    `get` returns `HashInProgress`; in this case we have no idea how
+        //    long the hashing has already been running, so would not know how
+        //    long to try to wait.
+        //
+        // It should be pretty cheap to poll the SP at 1 Hz, so we sidestep both
+        // of those issues by doing so.
+        const SLEEP_BETWEEN_POLLS: Duration = Duration::from_secs(1);
+        const PHASE1_FLASH: &str =
+            SpComponent::HOST_CPU_BOOT_FLASH.const_as_str();
+
+        let need_to_start_hashing = match self
+            .sp_component_hash_firmware_get(
+                sp.type_,
+                sp.slot,
+                PHASE1_FLASH,
+                phase1_slot,
+            )
+            .await
+            .map_err(|err| HostPhase1HashError::RequestError {
+                kind: "get hash",
+                err,
+            })?
+            .into_inner()
+        {
+            ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash),
+            ComponentFirmwareHashStatus::HashInProgress => false,
+            ComponentFirmwareHashStatus::HashNotCalculated => true,
+        };
+
+        if need_to_start_hashing {
+            // It's possible multiple Nexus instances race, all see
+            // `HashNotCalculated` above, then all try to start hashing here.
+            // The SP will accept the first request and return a
+            // `HashInProgress` error for subsequent attempts, but MGS does its
+            // best to make this operation idempotent; in particular, it will
+            // catch a `HashInProgress` error here and return an HTTP success.
+            // We'll return any other error.
+            self.sp_component_hash_firmware_start(
+                sp.type_,
+                sp.slot,
+                PHASE1_FLASH,
+                phase1_slot,
+            )
+            .await
+            .map_err(|err| HostPhase1HashError::RequestError {
+                kind: "start hashing",
+                err,
+            })?;
+        }
+
+        let start = Instant::now();
+        loop {
+            tokio::time::sleep(SLEEP_BETWEEN_POLLS).await;
+            if start.elapsed() > timeout {
+                return Err(HostPhase1HashError::Timeout);
+            }
+            match self
+                .sp_component_hash_firmware_get(
+                    sp.type_,
+                    sp.slot,
+                    PHASE1_FLASH,
+                    phase1_slot,
+                )
+                .await
+                .map_err(|err| HostPhase1HashError::RequestError {
+                    kind: "get hash",
+                    err,
+                })?
+                .into_inner()
+            {
+                ComponentFirmwareHashStatus::Hashed(hash) => return Ok(hash),
+                ComponentFirmwareHashStatus::HashInProgress => continue,
+                ComponentFirmwareHashStatus::HashNotCalculated => {
+                    return Err(
+                        HostPhase1HashError::ContentsModifiedWhileHashing,
+                    );
+                }
            }
        }
    }
}
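
For orientation, a minimal usage sketch of the new client method, assuming an already-constructed gateway `Client` and an `SpIdentifier` for the target sled; the method name, arguments, and error type come from the diff above, while the wrapper function, the 30-second timeout, and the import paths are illustrative assumptions:

use std::time::Duration;

use gateway_client::types::SpIdentifier;
use gateway_client::{Client, HostPhase1HashError};

// Illustrative caller: fetch the hash of one host phase 1 flash slot for one
// SP, allowing up to 30 seconds for the SP to finish hashing if it has to
// start from scratch.
async fn phase1_slot0_hash(
    client: &Client,
    sp: SpIdentifier,
) -> Result<[u8; 32], HostPhase1HashError> {
    // The helper issues the initial "get", starts hashing if the SP reports
    // `HashNotCalculated`, then polls at 1 Hz until a hash is available or
    // the timeout elapses.
    client
        .host_phase_1_flash_hash_calculate_with_timeout(
            sp,
            0,                       // phase 1 flash slot (A = 0; assumption)
            Duration::from_secs(30), // caller-chosen timeout (assumption)
        )
        .await
}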

common/src/disk.rs

Lines changed: 1 addition & 0 deletions
@@ -619,6 +619,7 @@ impl DiskManagementError {
     Deserialize,
     Serialize,
     JsonSchema,
+    strum::EnumIter,
 )]
 pub enum M2Slot {
     A,
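
The `strum::EnumIter` derive added here lets callers enumerate the M.2 / host phase 1 slots rather than hard-coding A and B. A minimal sketch of that pattern, assuming the two-variant shape shown in the diff; the standalone enum below is an illustrative stand-in, not the real `M2Slot` in `common/src/disk.rs`:

use strum::{EnumIter, IntoEnumIterator};

// Illustrative stand-in for M2Slot; the real type also derives the serde and
// JsonSchema traits shown in the diff above.
#[derive(Debug, Clone, Copy, PartialEq, Eq, EnumIter)]
enum M2Slot {
    A,
    B,
}

fn main() {
    // With EnumIter derived, `M2Slot::iter()` yields A then B, so inventory
    // code can loop over every phase 1 slot instead of listing variants.
    for slot in M2Slot::iter() {
        println!("would request a phase 1 hash for slot {slot:?}");
    }
}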

dev-tools/omdb/tests/test_all_output.rs

Lines changed: 18 additions & 8 deletions
@@ -16,13 +16,15 @@ use nexus_test_utils_macros::nexus_test;
 use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::SledFilter;
 use nexus_types::deployment::UnstableReconfiguratorState;
+use omicron_common::api::external::SwitchLocation;
 use omicron_test_utils::dev::test_cmds::Redactor;
 use omicron_test_utils::dev::test_cmds::path_to_executable;
 use omicron_test_utils::dev::test_cmds::run_command;
 use slog_error_chain::InlineErrorChain;
 use std::fmt::Write;
 use std::net::IpAddr;
 use std::path::Path;
+use std::time::Duration;
 use subprocess::Exec;
 use uuid::Uuid;
 
@@ -131,17 +133,20 @@ async fn test_omdb_usage_errors() {
 async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
     clear_omdb_env();
 
-    let gwtestctx = gateway_test_utils::setup::test_setup(
-        "test_omdb_success_case",
-        gateway_messages::SpPort::One,
-    )
-    .await;
     let cmd_path = path_to_executable(CMD_OMDB);
 
     let postgres_url = cptestctx.database.listen_url();
     let nexus_internal_url =
         format!("http://{}/", cptestctx.internal_client.bind_address);
-    let mgs_url = format!("http://{}/", gwtestctx.client.bind_address);
+    let mgs_url = format!(
+        "http://{}/",
+        cptestctx
+            .gateway
+            .get(&SwitchLocation::Switch0)
+            .expect("nexus_test always sets up MGS on switch 0")
+            .client
+            .bind_address
+    );
     let ox_url = format!("http://{}/", cptestctx.oximeter.server_address());
     let ox_test_producer = cptestctx.producer.address().ip();
     let ch_url = format!("http://{}/", cptestctx.clickhouse.http_address());
@@ -165,6 +170,13 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
     )
     .await;
 
+    // Wait for Nexus to have gathered at least one inventory collection. (We'll
+    // check below that `reconfigurator export` contains at least one, so have
+    // to wait until there's one to export.)
+    cptestctx
+        .wait_for_at_least_one_inventory_collection(Duration::from_secs(60))
+        .await;
+
     let mut output = String::new();
 
     let invocations: &[&[&str]] = &[
@@ -319,8 +331,6 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
         &ox_url,
         ox_test_producer,
     );
-
-    gwtestctx.teardown().await;
 }
 
 /// Verify that we properly deal with cases where:
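
The added wait is a poll-until-ready guard so that the `reconfigurator export` check later in the test has at least one inventory collection to export. A hedged sketch of that general pattern; the helper name, the synchronous `check` closure, and the 1-second poll interval below are illustrative assumptions, not the real `wait_for_at_least_one_inventory_collection` implementation:

use std::time::{Duration, Instant};

// Illustrative poll-with-timeout helper; `check` stands in for "has Nexus
// recorded an inventory collection yet?".
async fn wait_until<F>(mut check: F, timeout: Duration) -> Result<(), String>
where
    F: FnMut() -> bool,
{
    let start = Instant::now();
    loop {
        if check() {
            return Ok(());
        }
        if start.elapsed() > timeout {
            return Err("timed out waiting for condition".to_string());
        }
        // Poll once per second, mirroring the 1 Hz polling in the gateway
        // client helper earlier in this commit.
        tokio::time::sleep(Duration::from_secs(1)).await;
    }
}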

dev-tools/reconfigurator-cli/tests/input/cmds-example.txt

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ load-example --seed test-basic --nsleds 3 --sled-policy 1:non-provisionable --sl
 
 blueprint-list
 blueprint-show latest
-inventory-show latest
+inventory-show latest all
 
 # Plan a blueprint run -- this will cause zones and disks on the expunged
 # sled to be expunged.

dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout

Lines changed: 67 additions & 1 deletion
@@ -1013,13 +1013,79 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9
 PENDING MGS-MANAGED UPDATES: 0
 
 
-> inventory-show latest
+> inventory-show latest all
 collection: 9e187896-7809-46d0-9210-d75be1b3c4d4
 collector: example
 started: <REDACTED_TIMESTAMP>
 done: <REDACTED_TIMESTAMP>
 errors: 0
 
+Sled serial0
+  part number: model0
+  power: A2
+  revision: 0
+  MGS slot: Sled 0 (cubby 0)
+  found at: <REDACTED_TIMESTAMP> from fake MGS 1
+  host phase 1 hashes:
+    SLOT  HASH
+    A     0101010101010101010101010101010101010101010101010101010101010101
+    B     0202020202020202020202020202020202020202020202020202020202020202
+  cabooses:
+    SLOT     BOARD        NAME         VERSION  GIT_COMMIT  SIGN
+    SpSlot0  SimGimletSp  SimGimletSp  0.0.1    unknown     n/a
+  RoT pages:
+    SLOT  DATA_BASE64
+  RoT: active slot: slot A
+  RoT: persistent boot preference: slot A
+  RoT: pending persistent boot preference: -
+  RoT: transient boot preference: -
+  RoT: slot A SHA3-256: slotAdigest1
+  RoT: slot B SHA3-256: slotBdigest1
+
+Sled serial1
+  part number: model1
+  power: A2
+  revision: 0
+  MGS slot: Sled 1 (cubby 1)
+  found at: <REDACTED_TIMESTAMP> from fake MGS 1
+  host phase 1 hashes:
+    SLOT  HASH
+    A     0101010101010101010101010101010101010101010101010101010101010101
+    B     0202020202020202020202020202020202020202020202020202020202020202
+  cabooses:
+    SLOT     BOARD        NAME         VERSION  GIT_COMMIT  SIGN
+    SpSlot0  SimGimletSp  SimGimletSp  0.0.1    unknown     n/a
+  RoT pages:
+    SLOT  DATA_BASE64
+  RoT: active slot: slot A
+  RoT: persistent boot preference: slot A
+  RoT: pending persistent boot preference: -
+  RoT: transient boot preference: -
+  RoT: slot A SHA3-256: slotAdigest1
+  RoT: slot B SHA3-256: slotBdigest1
+
+Sled serial2
+  part number: model2
+  power: A2
+  revision: 0
+  MGS slot: Sled 2 (cubby 2)
+  found at: <REDACTED_TIMESTAMP> from fake MGS 1
+  host phase 1 hashes:
+    SLOT  HASH
+    A     0101010101010101010101010101010101010101010101010101010101010101
+    B     0202020202020202020202020202020202020202020202020202020202020202
+  cabooses:
+    SLOT     BOARD        NAME         VERSION  GIT_COMMIT  SIGN
+    SpSlot0  SimGimletSp  SimGimletSp  0.0.1    unknown     n/a
+  RoT pages:
+    SLOT  DATA_BASE64
+  RoT: active slot: slot A
+  RoT: persistent boot preference: slot A
+  RoT: pending persistent boot preference: -
+  RoT: transient boot preference: -
+  RoT: slot A SHA3-256: slotAdigest1
+  RoT: slot B SHA3-256: slotBdigest1
+
 SLED AGENTS
 
 sled 2eb69596-f081-4e2d-9425-9994926e0832 (role = Gimlet, serial serial1)
