Skip to content

Commit 2d9d447

Browse files
authored
Reconfigurator: Disable decommissioning cockroach nodes (#8446)
See #8445 for more context. This PR disables decommissioning from both sides: * cockroach-admin will return a BAD_REQUEST for any decommissioning attempts. * The blueprint_executor RPW will not attempt to decommission expunged cockroach zones.
1 parent 5260a19 commit 2d9d447

File tree

3 files changed

+48
-19
lines changed

3 files changed

+48
-19
lines changed

cockroach-admin/src/cockroach_cli.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ impl CockroachCli {
179179
self.invoke_node_decommission(node_id).await
180180
}
181181

182+
// TODO-correctness These validation checks are NOT sufficient; see
183+
// https://github.com/oxidecomputer/omicron/issues/8445. Our request handler
184+
// for node decommissioning rejects requests, but we keep this method around
185+
// for documentation and because we hope to extend it to have correct checks
186+
// in the near future.
182187
fn validate_node_decommissionable(
183188
&self,
184189
node_id: &str,
@@ -191,7 +196,9 @@ impl CockroachCli {
191196
// 2. The node must not have any `gossiped_replicas` (the cockroach
192197
// cluster must not think this node is an active member of any ranges
193198
// - in practice, we see this go to 0 about 60 seconds after a node
194-
// goes offline)
199+
// goes offline). This attempts to guard against a window of time
200+
// where a node has gone offline but cockroach has not yet reflected
201+
// that fact in the counts of underreplicated ranges.
195202
// 3. The range descriptor for all ranges must not include the node we
196203
// want to decommission. This gate shouldn't be necessary, but exists
197204
// as a safeguard to avoid triggering what appears to be a CRDB race

cockroach-admin/src/http_entrypoints.rs

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -60,19 +60,16 @@ impl CockroachAdminApi for CockroachAdminImpl {
6060
}
6161

6262
async fn node_decommission(
63-
rqctx: RequestContext<Self::Context>,
64-
body: TypedBody<NodeId>,
63+
_rqctx: RequestContext<Self::Context>,
64+
_body: TypedBody<NodeId>,
6565
) -> Result<HttpResponseOk<NodeDecommission>, HttpError> {
66-
let ctx = rqctx.context();
67-
let NodeId { node_id } = body.into_inner();
68-
let decommission_status =
69-
ctx.cockroach_cli().node_decommission(&node_id).await?;
70-
info!(
71-
ctx.log(), "successfully decommissioned node";
72-
"node_id" => node_id,
73-
"status" => ?decommission_status,
74-
);
75-
Ok(HttpResponseOk(decommission_status))
66+
// We should call ctx.cockroach_cli().node_decommission(), but can't at
67+
// the moment due to timing concerns.
68+
Err(HttpError::for_bad_request(
69+
None,
70+
"decommissioning cockroach nodes not supported (see omicron#8445)"
71+
.to_string(),
72+
))
7673
}
7774

7875
async fn status_vars(

nexus/reconfigurator/execution/src/omicron_zones.rs

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,19 @@ pub(crate) async fn clean_up_expunged_zones<R: CleanupResolver>(
3737
resolver: &R,
3838
blueprint: &Blueprint,
3939
) -> Result<(), Vec<anyhow::Error>> {
40+
// TODO-correctness Decommissioning cockroach nodes is currently disabled
41+
// while we work through issues with timing; see
42+
// https://github.com/oxidecomputer/omicron/issues/8445. We keep the
43+
// implementation around and gate it via an argument because we already have
44+
// unit tests for it and because we'd like to restore it once
45+
// cockroach-admin knows how to gate decommissioning correctly.
46+
let decommission_cockroach = false;
47+
4048
clean_up_expunged_zones_impl(
4149
opctx,
4250
datastore,
4351
resolver,
52+
decommission_cockroach,
4453
blueprint
4554
.all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup),
4655
)
@@ -51,6 +60,7 @@ async fn clean_up_expunged_zones_impl<R: CleanupResolver>(
5160
opctx: &OpContext,
5261
datastore: &DataStore,
5362
resolver: &R,
63+
decommission_cockroach: bool,
5464
zones_to_clean_up: impl Iterator<Item = (SledUuid, &BlueprintZoneConfig)>,
5565
) -> Result<(), Vec<anyhow::Error>> {
5666
let errors: Vec<anyhow::Error> = stream::iter(zones_to_clean_up)
@@ -66,12 +76,18 @@ async fn clean_up_expunged_zones_impl<R: CleanupResolver>(
6676
BlueprintZoneType::Nexus(_) => None,
6777

6878
// Zones which need cleanup after expungement.
69-
BlueprintZoneType::CockroachDb(_) => Some(
70-
decommission_cockroachdb_node(
71-
opctx, datastore, resolver, config.id, &log,
72-
)
73-
.await,
74-
),
79+
BlueprintZoneType::CockroachDb(_) => {
80+
if decommission_cockroach {
81+
Some(
82+
decommission_cockroachdb_node(
83+
opctx, datastore, resolver, config.id, &log,
84+
)
85+
.await,
86+
)
87+
} else {
88+
None
89+
}
90+
}
7591
BlueprintZoneType::Oximeter(_) => Some(
7692
oximeter_cleanup(opctx, datastore, config.id, &log).await,
7793
),
@@ -303,6 +319,11 @@ mod test {
303319
async fn test_clean_up_cockroach_zones(
304320
cptestctx: &ControlPlaneTestContext,
305321
) {
322+
// The whole point of this test is to check that we send decommissioning
323+
// requests; enable that. (See the real `clean_up_expunged_zones()` for
324+
// more context.)
325+
let decommission_cockroach = true;
326+
306327
// Test setup boilerplate.
307328
let nexus = &cptestctx.server.server_context().nexus;
308329
let datastore = nexus.datastore();
@@ -356,6 +377,7 @@ mod test {
356377
&opctx,
357378
datastore,
358379
&fake_resolver,
380+
decommission_cockroach,
359381
iter::once((any_sled_id, &crdb_zone)),
360382
)
361383
.await
@@ -402,6 +424,7 @@ mod test {
402424
&opctx,
403425
datastore,
404426
&fake_resolver,
427+
decommission_cockroach,
405428
iter::once((any_sled_id, &crdb_zone)),
406429
)
407430
.await
@@ -433,6 +456,7 @@ mod test {
433456
&opctx,
434457
datastore,
435458
&fake_resolver,
459+
decommission_cockroach,
436460
iter::once((any_sled_id, &crdb_zone)),
437461
)
438462
.await
@@ -461,6 +485,7 @@ mod test {
461485
&opctx,
462486
datastore,
463487
&fake_resolver,
488+
decommission_cockroach,
464489
iter::once((any_sled_id, &crdb_zone)),
465490
)
466491
.await

0 commit comments

Comments
 (0)