Skip to content

Commit 80a9e91

Browse files
authored
[24/n] [reconfigurator-planning] support no-op image source updates (#8486)
RFD 556 describes one step of the mupdate recovery process: switching a zone's image source from the install dataset to the TUF repo depot. During discussions, we figured out that, for the most part, this switch doesn't have to be tied to the mupdate override mechanism, and that we are allowed to perform it on any zone where a mupdate override isn't in place. This depends on some pending work in Sled Agent to treat image source switches as no-ops if the underlying hash is the same.
1 parent 58290e9 commit 80a9e91

File tree

20 files changed

+1259
-31
lines changed

20 files changed

+1259
-31
lines changed

Cargo.lock

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dev-tools/reconfigurator-cli/src/lib.rs

Lines changed: 178 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
//! developer REPL for driving blueprint planning
66
77
use anyhow::{Context, anyhow, bail};
8-
use camino::Utf8PathBuf;
8+
use camino::{Utf8Path, Utf8PathBuf};
99
use clap::ValueEnum;
1010
use clap::{Args, Parser, Subcommand};
1111
use iddqd::IdOrdMap;
@@ -20,9 +20,9 @@ use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
2020
use nexus_reconfigurator_planning::example::ExampleSystemBuilder;
2121
use nexus_reconfigurator_planning::planner::Planner;
2222
use nexus_reconfigurator_planning::system::{SledBuilder, SystemDescription};
23-
use nexus_reconfigurator_simulation::SimStateBuilder;
24-
use nexus_reconfigurator_simulation::Simulator;
2523
use nexus_reconfigurator_simulation::{BlueprintId, SimState};
24+
use nexus_reconfigurator_simulation::{SimStateBuilder, SimTufRepoSource};
25+
use nexus_reconfigurator_simulation::{SimTufRepoDescription, Simulator};
2626
use nexus_sled_agent_shared::inventory::ZoneKind;
2727
use nexus_types::deployment::PlanningInput;
2828
use nexus_types::deployment::SledFilter;
@@ -39,18 +39,19 @@ use nexus_types::deployment::{OmicronZoneNic, TargetReleaseDescription};
3939
use nexus_types::external_api::views::SledPolicy;
4040
use nexus_types::external_api::views::SledProvisionPolicy;
4141
use omicron_common::address::REPO_DEPOT_PORT;
42-
use omicron_common::api::external::Generation;
4342
use omicron_common::api::external::Name;
43+
use omicron_common::api::external::{Generation, TufRepoDescription};
4444
use omicron_common::policy::NEXUS_REDUNDANCY;
45+
use omicron_common::update::OmicronZoneManifestSource;
4546
use omicron_repl_utils::run_repl_from_file;
4647
use omicron_repl_utils::run_repl_on_stdin;
47-
use omicron_uuid_kinds::CollectionUuid;
4848
use omicron_uuid_kinds::GenericUuid;
4949
use omicron_uuid_kinds::OmicronZoneUuid;
5050
use omicron_uuid_kinds::ReconfiguratorSimUuid;
5151
use omicron_uuid_kinds::SledUuid;
5252
use omicron_uuid_kinds::VnicUuid;
5353
use omicron_uuid_kinds::{BlueprintUuid, MupdateOverrideUuid};
54+
use omicron_uuid_kinds::{CollectionUuid, MupdateUuid};
5455
use std::borrow::Cow;
5556
use std::convert::Infallible;
5657
use std::fmt::{self, Write};
@@ -220,6 +221,9 @@ fn process_command(
220221
Commands::SledRemove(args) => cmd_sled_remove(sim, args),
221222
Commands::SledShow(args) => cmd_sled_show(sim, args),
222223
Commands::SledSetPolicy(args) => cmd_sled_set_policy(sim, args),
224+
Commands::SledUpdateInstallDataset(args) => {
225+
cmd_sled_update_install_dataset(sim, args)
226+
}
223227
Commands::SledUpdateSp(args) => cmd_sled_update_sp(sim, args),
224228
Commands::SiloList => cmd_silo_list(sim),
225229
Commands::SiloAdd(args) => cmd_silo_add(sim, args),
@@ -275,6 +279,8 @@ enum Commands {
275279
SledShow(SledArgs),
276280
/// set a sled's policy
277281
SledSetPolicy(SledSetPolicyArgs),
282+
/// update the install dataset on a sled, simulating a mupdate
283+
SledUpdateInstallDataset(SledUpdateInstallDatasetArgs),
278284
/// simulate updating the sled's SP versions
279285
SledUpdateSp(SledUpdateSpArgs),
280286

@@ -395,6 +401,52 @@ impl From<SledPolicyOpt> for SledPolicy {
395401
}
396402
}
397403

404+
#[derive(Debug, Args)]
405+
struct SledUpdateInstallDatasetArgs {
406+
/// id of the sled
407+
sled_id: SledOpt,
408+
409+
#[clap(flatten)]
410+
source: SledMupdateSource,
411+
}
412+
413+
#[derive(Debug, Args)]
414+
// This makes it so that only one source can be specified.
415+
struct SledMupdateSource {
416+
#[clap(flatten)]
417+
valid: SledMupdateValidSource,
418+
419+
/// set the mupdate source to Installinator with the given ID
420+
#[clap(long, requires = "sled-mupdate-valid-source")]
421+
mupdate_id: Option<MupdateUuid>,
422+
423+
/// simulate an error reading the zone manifest
424+
#[clap(long, conflicts_with = "sled-mupdate-valid-source")]
425+
with_manifest_error: bool,
426+
427+
/// simulate an error validating zones by this artifact ID name
428+
///
429+
/// This uses the `artifact_id_name` representation of a zone kind.
430+
#[clap(
431+
long,
432+
value_name = "ARTIFACT_ID_NAME",
433+
requires = "sled-mupdate-valid-source"
434+
)]
435+
with_zone_error: Vec<String>,
436+
}
437+
438+
#[derive(Debug, Args)]
439+
#[group(id = "sled-mupdate-valid-source", multiple = false)]
440+
struct SledMupdateValidSource {
441+
/// the TUF repo.zip to simulate the mupdate from
442+
#[clap(long)]
443+
from_repo: Option<Utf8PathBuf>,
444+
445+
/// simulate a mupdate to the target release
446+
#[clap(long)]
447+
to_target_release: bool,
448+
}
449+
398450
#[derive(Debug, Args)]
399451
struct SledUpdateSpArgs {
400452
/// id of the sled
@@ -879,6 +931,10 @@ struct TufAssembleArgs {
879931
/// The tufaceous manifest path (relative to this crate's root)
880932
manifest_path: Utf8PathBuf,
881933

934+
/// Allow non-semver artifact versions.
935+
#[clap(long)]
936+
allow_non_semver: bool,
937+
882938
#[clap(
883939
long,
884940
// Use help here rather than a doc comment because rustdoc doesn't like
@@ -1156,6 +1212,32 @@ fn cmd_sled_set_policy(
11561212
Ok(Some(format!("set sled {} policy to {}", sled_id, args.policy)))
11571213
}
11581214

1215+
fn cmd_sled_update_install_dataset(
1216+
sim: &mut ReconfiguratorSim,
1217+
args: SledUpdateInstallDatasetArgs,
1218+
) -> anyhow::Result<Option<String>> {
1219+
let description = mupdate_source_to_description(sim, &args.source)?;
1220+
1221+
let mut state = sim.current_state().to_mut();
1222+
let system = state.system_mut();
1223+
let sled_id = args.sled_id.to_sled_id(system.description())?;
1224+
system
1225+
.description_mut()
1226+
.sled_set_zone_manifest(sled_id, description.to_boot_inventory())?;
1227+
1228+
sim.commit_and_bump(
1229+
format!(
1230+
"reconfigurator-cli sled-update-install-dataset: {}",
1231+
description.message,
1232+
),
1233+
state,
1234+
);
1235+
Ok(Some(format!(
1236+
"sled {}: install dataset updated: {}",
1237+
sled_id, description.message
1238+
)))
1239+
}
1240+
11591241
fn cmd_sled_update_sp(
11601242
sim: &mut ReconfiguratorSim,
11611243
args: SledUpdateSpArgs,
@@ -1955,26 +2037,8 @@ fn cmd_set(
19552037
rv
19562038
}
19572039
SetArgs::TargetRelease { filename } => {
1958-
let file = std::fs::File::open(&filename)
1959-
.with_context(|| format!("open {:?}", filename))?;
1960-
let buf = std::io::BufReader::new(file);
1961-
let rt = tokio::runtime::Runtime::new()
1962-
.context("creating tokio runtime")?;
1963-
// We're not using the repo hash here. Make one up.
1964-
let repo_hash = ArtifactHash([0; 32]);
1965-
let artifacts_with_plan = rt.block_on(async {
1966-
ArtifactsWithPlan::from_zip(
1967-
buf,
1968-
None,
1969-
repo_hash,
1970-
ControlPlaneZonesMode::Split,
1971-
&sim.log,
1972-
)
1973-
.await
1974-
.with_context(|| format!("unpacking {:?}", filename))
1975-
})?;
1976-
let description = artifacts_with_plan.description().clone();
1977-
drop(artifacts_with_plan);
2040+
let description =
2041+
extract_tuf_repo_description(&sim.log, &filename)?;
19782042
state.system_mut().description_mut().set_target_release(
19792043
TargetReleaseDescription::TufRepo(description),
19802044
);
@@ -1986,6 +2050,84 @@ fn cmd_set(
19862050
Ok(Some(rv))
19872051
}
19882052

2053+
/// Converts a mupdate source to a TUF repo description.
2054+
fn mupdate_source_to_description(
2055+
sim: &ReconfiguratorSim,
2056+
source: &SledMupdateSource,
2057+
) -> anyhow::Result<SimTufRepoDescription> {
2058+
let manifest_source = match source.mupdate_id {
2059+
Some(mupdate_id) => {
2060+
OmicronZoneManifestSource::Installinator { mupdate_id }
2061+
}
2062+
None => OmicronZoneManifestSource::SledAgent,
2063+
};
2064+
if let Some(repo_path) = &source.valid.from_repo {
2065+
let description = extract_tuf_repo_description(&sim.log, repo_path)?;
2066+
let mut sim_source = SimTufRepoSource::new(
2067+
description,
2068+
manifest_source,
2069+
format!("from repo at {repo_path}"),
2070+
)?;
2071+
sim_source.simulate_zone_errors(&source.with_zone_error)?;
2072+
Ok(SimTufRepoDescription::new(sim_source))
2073+
} else if source.valid.to_target_release {
2074+
let description = sim
2075+
.current_state()
2076+
.system()
2077+
.description()
2078+
.target_release()
2079+
.description();
2080+
match description {
2081+
TargetReleaseDescription::Initial => {
2082+
bail!(
2083+
"cannot mupdate zones without a target release \
2084+
(use `set target-release` or --from-repo)"
2085+
)
2086+
}
2087+
TargetReleaseDescription::TufRepo(desc) => {
2088+
let mut sim_source = SimTufRepoSource::new(
2089+
desc.clone(),
2090+
manifest_source,
2091+
"to target release".to_owned(),
2092+
)?;
2093+
sim_source.simulate_zone_errors(&source.with_zone_error)?;
2094+
Ok(SimTufRepoDescription::new(sim_source))
2095+
}
2096+
}
2097+
} else if source.with_manifest_error {
2098+
Ok(SimTufRepoDescription::new_error(
2099+
"simulated error obtaining zone manifest".to_owned(),
2100+
))
2101+
} else {
2102+
bail!("an update source must be specified")
2103+
}
2104+
}
2105+
2106+
fn extract_tuf_repo_description(
2107+
log: &slog::Logger,
2108+
filename: &Utf8Path,
2109+
) -> anyhow::Result<TufRepoDescription> {
2110+
let file = std::fs::File::open(filename)
2111+
.with_context(|| format!("open {:?}", filename))?;
2112+
let buf = std::io::BufReader::new(file);
2113+
let rt =
2114+
tokio::runtime::Runtime::new().context("creating tokio runtime")?;
2115+
let repo_hash = ArtifactHash([0; 32]);
2116+
let artifacts_with_plan = rt.block_on(async {
2117+
ArtifactsWithPlan::from_zip(
2118+
buf,
2119+
None,
2120+
repo_hash,
2121+
ControlPlaneZonesMode::Split,
2122+
log,
2123+
)
2124+
.await
2125+
.with_context(|| format!("unpacking {:?}", filename))
2126+
})?;
2127+
let description = artifacts_with_plan.description().clone();
2128+
Ok(description)
2129+
}
2130+
19892131
fn cmd_tuf_assemble(
19902132
sim: &ReconfiguratorSim,
19912133
args: TufAssembleArgs,
@@ -2016,18 +2158,26 @@ fn cmd_tuf_assemble(
20162158
Utf8PathBuf::from(format!("repo-{}.zip", manifest.system_version))
20172159
};
20182160

2161+
if output_path.exists() {
2162+
bail!("output path `{output_path}` already exists");
2163+
}
2164+
20192165
// Just use a fixed key for now.
20202166
//
20212167
// In the future we may want to test changing the TUF key.
2022-
let args = tufaceous::Args::try_parse_from([
2168+
let mut tufaceous_args = vec![
20232169
"tufaceous",
20242170
"--key",
20252171
DEFAULT_TUFACEOUS_KEY,
20262172
"assemble",
20272173
manifest_path.as_str(),
20282174
output_path.as_str(),
2029-
])
2030-
.expect("args are valid so this shouldn't fail");
2175+
];
2176+
if args.allow_non_semver {
2177+
tufaceous_args.push("--allow-non-semver");
2178+
}
2179+
let args = tufaceous::Args::try_parse_from(tufaceous_args)
2180+
.expect("args are valid so this shouldn't fail");
20312181
let rt =
20322182
tokio::runtime::Runtime::new().context("creating tokio runtime")?;
20332183
rt.block_on(async move { args.exec(&sim.log).await })
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Load an example system. The sled with serial5 is marked non-provisionable
2+
# so that discretionary zones don't make their way onto it. (We're going to
3+
# expunge it below to test that we don't try to update zone image sources
4+
# on expunged sleds.)
5+
load-example --nsleds 6 --ndisks-per-sled 1 --sled-policy 5:non-provisionable
6+
7+
sled-list
8+
9+
# Create a TUF repository from a fake manifest. (The output TUF repo is
10+
# written to a temporary directory that this invocation of `reconfigurator-cli`
11+
# is running out of as its working directory.)
12+
tuf-assemble ../../update-common/manifests/fake.toml
13+
# Create a second TUF repository from a different fake manifest.
14+
tuf-assemble ../../update-common/manifests/fake-non-semver.toml --allow-non-semver
15+
16+
# Load the target release from the first TUF repository.
17+
set target-release repo-1.0.0.zip
18+
19+
# On one sled, update the install dataset.
20+
sled-update-install-dataset serial0 --to-target-release
21+
22+
# On another sled, simulate an error reading the zone manifest.
23+
sled-update-install-dataset serial1 --with-manifest-error
24+
25+
# On a third sled, update the install dataset and simulate a mupdate override.
26+
# (Currently we do this in the blueprint, but with
27+
# https://github.com/oxidecomputer/omicron/pull/8456 we should update this test and
28+
# set a mupdate-override on the sled directly.)
29+
sled-update-install-dataset serial2 --to-target-release
30+
blueprint-edit latest set-remove-mupdate-override serial2 ffffffff-ffff-ffff-ffff-ffffffffffff
31+
32+
# On a fourth sled, simulate an error validating the install dataset image on one zone.
33+
# We pick ntp because internal-ntp is non-discretionary.
34+
sled-update-install-dataset serial3 --to-target-release --with-zone-error ntp
35+
36+
# On a fifth sled, set the install dataset to the repo-2.0.0.zip generated by the
37+
# second TUF repository.
38+
sled-update-install-dataset serial4 --from-repo repo-2.0.0.zip
39+
40+
# On the sixth sled, update to the target release (so it shows up in inventory).
41+
# Then, mark the sled expunged (in the planning input).
42+
sled-update-install-dataset serial5 --to-target-release
43+
sled-set-policy serial5 expunged
44+
45+
# Generate an inventory and run a blueprint planning step.
46+
inventory-generate
47+
blueprint-plan latest eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51
48+
49+
# This diff should show expected changes to the blueprint.
50+
blueprint-diff 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 latest

dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ generated inventory collection eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51 from configu
3636
> # Try to plan a new blueprint; this should be okay even though the sled
3737
> # we added has no disks.
3838
> blueprint-plan dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51
39+
INFO skipping noop image source check for all sleds (no current TUF repo)
3940
INFO skipping sled (no zpools in service), sled_id: 00320471-945d-413c-85e7-03e091a70b3c
4041
INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
4142
INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1

dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,7 @@ T ENA ID PARENT
493493
* yes ade5749d-bdf3-4fab-a8ae-00bea01b3a5a 02697f74-b14a-4418-90f0-c28b2a3a6aa9 <REDACTED_TIMESTAMP>
494494

495495
> blueprint-plan ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
496+
INFO skipping noop image source check for all sleds (no current TUF repo)
496497
INFO found sled missing NTP zone (will add one), sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b
497498
INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
498499
WARN failed to place all new desired Clickhouse zones, placed: 0, wanted_to_place: 1
@@ -936,6 +937,7 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9
936937
> # Plan a blueprint run -- this will cause zones and disks on the expunged
937938
> # sled to be expunged.
938939
> blueprint-plan latest
940+
INFO skipping noop image source check for all sleds (no current TUF repo)
939941
INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0
940942
INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1
941943
INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0

0 commit comments

Comments
 (0)