Skip to content

Commit fc4d99d

Browse files
authored
Add bg task for collecting chicken switches from DB (#8462)
Wire up the task watcher into the planner so that it only runs when the `planner_enabled` switch is set to `true`. Fixes #8253
1 parent f0029f5 commit fc4d99d

File tree

16 files changed

+228
-20
lines changed

16 files changed

+228
-20
lines changed

dev-tools/omdb/src/bin/omdb/nexus/chicken_switches.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
77
use crate::Omdb;
88
use crate::check_allow_destructive::DestructiveOperationToken;
9+
use clap::ArgAction;
910
use clap::Args;
1011
use clap::Subcommand;
1112
use http::StatusCode;
@@ -33,6 +34,7 @@ pub enum ChickenSwitchesCommands {
3334

3435
#[derive(Debug, Clone, Args)]
3536
pub struct ChickenSwitchesSetArgs {
37+
#[clap(long, action=ArgAction::Set)]
3638
planner_enabled: bool,
3739
}
3840

@@ -100,7 +102,13 @@ async fn chicken_switches_show(
100102
println!(" modified time: {time_modified}");
101103
println!(" planner enabled: {planner_enabled}");
102104
}
103-
Err(err) => eprintln!("error: {:#}", err),
105+
Err(err) => {
106+
if err.status() == Some(StatusCode::NOT_FOUND) {
107+
println!("No chicken switches enabled");
108+
} else {
109+
eprintln!("error: {:#}", err)
110+
}
111+
}
104112
}
105113

106114
Ok(())

dev-tools/omdb/tests/env.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ task: "blueprint_rendezvous"
5656
owned rendezvous tables that other subsystems consume
5757

5858

59+
task: "chicken_switches_watcher"
60+
watch db for chicken switch changes
61+
62+
5963
task: "crdb_node_id_collector"
6064
Collects node IDs of running CockroachDB zones
6165

@@ -260,6 +264,10 @@ task: "blueprint_rendezvous"
260264
owned rendezvous tables that other subsystems consume
261265

262266

267+
task: "chicken_switches_watcher"
268+
watch db for chicken switch changes
269+
270+
263271
task: "crdb_node_id_collector"
264272
Collects node IDs of running CockroachDB zones
265273

@@ -451,6 +459,10 @@ task: "blueprint_rendezvous"
451459
owned rendezvous tables that other subsystems consume
452460

453461

462+
task: "chicken_switches_watcher"
463+
watch db for chicken switch changes
464+
465+
454466
task: "crdb_node_id_collector"
455467
Collects node IDs of running CockroachDB zones
456468

dev-tools/omdb/tests/successes.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,10 @@ task: "blueprint_rendezvous"
268268
owned rendezvous tables that other subsystems consume
269269

270270

271+
task: "chicken_switches_watcher"
272+
watch db for chicken switch changes
273+
274+
271275
task: "crdb_node_id_collector"
272276
Collects node IDs of running CockroachDB zones
273277

@@ -543,6 +547,13 @@ task: "blueprint_rendezvous"
543547
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
544548
last completion reported error: no blueprint
545549

550+
task: "chicken_switches_watcher"
551+
configured period: every <REDACTED_DURATION>s
552+
currently executing: no
553+
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
554+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
555+
warning: unknown background task: "chicken_switches_watcher" (don't know how to interpret details: Object {"chicken_switches_updated": Bool(false)})
556+
546557
task: "crdb_node_id_collector"
547558
configured period: every <REDACTED_DURATION>m
548559
currently executing: no
@@ -1083,6 +1094,13 @@ task: "blueprint_rendezvous"
10831094
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
10841095
last completion reported error: no blueprint
10851096

1097+
task: "chicken_switches_watcher"
1098+
configured period: every <REDACTED_DURATION>s
1099+
currently executing: no
1100+
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
1101+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1102+
warning: unknown background task: "chicken_switches_watcher" (don't know how to interpret details: Object {"chicken_switches_updated": Bool(false)})
1103+
10861104
task: "crdb_node_id_collector"
10871105
configured period: every <REDACTED_DURATION>m
10881106
currently executing: no

docs/reconfigurator.adoc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ We're being cautious about rolling out that kind of automation. Instead, today,
175175

176176
`omdb` uses the Nexus internal API to do these things. Since this can only be done using `omdb`, Reconfigurator can really only be used by Oxide engineering and support, not customers.
177177

178-
The planner background task is currently disabled by default, but can be enabled by setting the Nexus configuration option `blueprints.disable_planner = false`. To get to the long term vision where the system is doing all this on its own in response to operator input, we'll need to get confidence that continually executing the planner will have no ill effects on working systems. This might involve more operational experience with it, more safeties, and tools for pausing execution, previewing what it _would_ do, etc.
178+
The planner background task is currently disabled by default, but can be enabled via `omdb nexus chicken-switches set --planner-enabled true`. To get to the long term vision where the system is doing all this on its own in response to operator input, we'll need to get confidence that continually executing the planner will have no ill effects on working systems. This might involve more operational experience with it, more safeties, and tools for pausing execution, previewing what it _would_ do, etc.
179179

180180
== Design patterns
181181

nexus-config/src/nexus_config.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -594,9 +594,6 @@ pub struct PhantomDiskConfig {
594594
#[serde_as]
595595
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
596596
pub struct BlueprintTasksConfig {
597-
/// background planner chicken switch
598-
pub disable_planner: bool,
599-
600597
/// period (in seconds) for periodic activations of the background task that
601598
/// reads the latest target blueprint from the database
602599
#[serde_as(as = "DurationSeconds<u64>")]
@@ -622,6 +619,11 @@ pub struct BlueprintTasksConfig {
622619
/// collects the node IDs of CockroachDB zones
623620
#[serde_as(as = "DurationSeconds<u64>")]
624621
pub period_secs_collect_crdb_node_ids: Duration,
622+
623+
/// period (in seconds) for periodic activations of the background task that
624+
/// reads chicken switches from the database
625+
#[serde_as(as = "DurationSeconds<u64>")]
626+
pub period_secs_load_chicken_switches: Duration,
625627
}
626628

627629
#[serde_as]
@@ -1079,12 +1081,12 @@ mod test {
10791081
physical_disk_adoption.period_secs = 30
10801082
decommissioned_disk_cleaner.period_secs = 30
10811083
phantom_disks.period_secs = 30
1082-
blueprints.disable_planner = true
10831084
blueprints.period_secs_load = 10
10841085
blueprints.period_secs_plan = 60
10851086
blueprints.period_secs_execute = 60
10861087
blueprints.period_secs_rendezvous = 300
10871088
blueprints.period_secs_collect_crdb_node_ids = 180
1089+
blueprints.period_secs_load_chicken_switches= 5
10881090
sync_service_zone_nat.period_secs = 30
10891091
switch_port_settings_manager.period_secs = 30
10901092
region_replacement.period_secs = 30
@@ -1247,13 +1249,14 @@ mod test {
12471249
period_secs: Duration::from_secs(30),
12481250
},
12491251
blueprints: BlueprintTasksConfig {
1250-
disable_planner: true,
12511252
period_secs_load: Duration::from_secs(10),
12521253
period_secs_plan: Duration::from_secs(60),
12531254
period_secs_execute: Duration::from_secs(60),
12541255
period_secs_collect_crdb_node_ids:
12551256
Duration::from_secs(180),
12561257
period_secs_rendezvous: Duration::from_secs(300),
1258+
period_secs_load_chicken_switches:
1259+
Duration::from_secs(5)
12571260
},
12581261
sync_service_zone_nat: SyncServiceZoneNatConfig {
12591262
period_secs: Duration::from_secs(30)
@@ -1396,12 +1399,12 @@ mod test {
13961399
physical_disk_adoption.period_secs = 30
13971400
decommissioned_disk_cleaner.period_secs = 30
13981401
phantom_disks.period_secs = 30
1399-
blueprints.disable_planner = true
14001402
blueprints.period_secs_load = 10
14011403
blueprints.period_secs_plan = 60
14021404
blueprints.period_secs_execute = 60
14031405
blueprints.period_secs_rendezvous = 300
14041406
blueprints.period_secs_collect_crdb_node_ids = 180
1407+
blueprints.period_secs_load_chicken_switches= 5
14051408
sync_service_zone_nat.period_secs = 30
14061409
switch_port_settings_manager.period_secs = 30
14071410
region_replacement.period_secs = 30
@@ -1424,6 +1427,7 @@ mod test {
14241427
alert_dispatcher.period_secs = 42
14251428
webhook_deliverator.period_secs = 43
14261429
sp_ereport_ingester.period_secs = 44
1430+
14271431
[default_region_allocation_strategy]
14281432
type = "random"
14291433
"##,

nexus/background-task-interface/src/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ pub struct BackgroundTasks {
4848
pub task_alert_dispatcher: Activator,
4949
pub task_webhook_deliverator: Activator,
5050
pub task_sp_ereport_ingester: Activator,
51+
pub task_chicken_switches_loader: Activator,
5152

5253
// Handles to activate background tasks that do not get used by Nexus
5354
// at-large. These background tasks are implementation details as far as

nexus/examples/config-second.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,12 @@ phantom_disks.period_secs = 30
118118
physical_disk_adoption.period_secs = 30
119119
support_bundle_collector.period_secs = 30
120120
decommissioned_disk_cleaner.period_secs = 60
121-
blueprints.disable_planner = true
122121
blueprints.period_secs_load = 10
123122
blueprints.period_secs_plan = 60
124123
blueprints.period_secs_execute = 60
125124
blueprints.period_secs_rendezvous = 300
126125
blueprints.period_secs_collect_crdb_node_ids = 180
126+
blueprints.period_secs_load_chicken_switches = 5
127127
sync_service_zone_nat.period_secs = 30
128128
switch_port_settings_manager.period_secs = 30
129129
region_replacement.period_secs = 30

nexus/examples/config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,12 @@ phantom_disks.period_secs = 30
104104
physical_disk_adoption.period_secs = 30
105105
support_bundle_collector.period_secs = 30
106106
decommissioned_disk_cleaner.period_secs = 60
107-
blueprints.disable_planner = true
108107
blueprints.period_secs_load = 10
109108
blueprints.period_secs_plan = 60
110109
blueprints.period_secs_execute = 60
111110
blueprints.period_secs_rendezvous = 300
112111
blueprints.period_secs_collect_crdb_node_ids = 180
112+
blueprints.period_secs_load_chicken_switches = 5
113113
sync_service_zone_nat.period_secs = 30
114114
switch_port_settings_manager.period_secs = 30
115115
region_replacement.period_secs = 30

nexus/src/app/background/init.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ use super::tasks::blueprint_execution;
9696
use super::tasks::blueprint_load;
9797
use super::tasks::blueprint_planner;
9898
use super::tasks::blueprint_rendezvous;
99+
use super::tasks::chicken_switches::ChickenSwitchesLoader;
99100
use super::tasks::crdb_node_id_collector;
100101
use super::tasks::decommissioned_disk_cleaner;
101102
use super::tasks::dns_config;
@@ -230,6 +231,7 @@ impl BackgroundTasksInitializer {
230231
task_alert_dispatcher: Activator::new(),
231232
task_webhook_deliverator: Activator::new(),
232233
task_sp_ereport_ingester: Activator::new(),
234+
task_chicken_switches_loader: Activator::new(),
233235

234236
task_internal_dns_propagation: Activator::new(),
235237
task_external_dns_propagation: Activator::new(),
@@ -306,6 +308,7 @@ impl BackgroundTasksInitializer {
306308
task_alert_dispatcher,
307309
task_webhook_deliverator,
308310
task_sp_ereport_ingester,
311+
task_chicken_switches_loader,
309312
// Add new background tasks here. Be sure to use this binding in a
310313
// call to `Driver::register()` below. That's what actually wires
311314
// up the Activator to the corresponding background task.
@@ -476,13 +479,26 @@ impl BackgroundTasksInitializer {
476479
inventory_watcher
477480
};
478481

482+
let chicken_switches_loader =
483+
ChickenSwitchesLoader::new(datastore.clone());
484+
let chicken_switches_watcher = chicken_switches_loader.watcher();
485+
driver.register(TaskDefinition {
486+
name: "chicken_switches_watcher",
487+
description: "watch db for chicken switch changes",
488+
period: config.blueprints.period_secs_load_chicken_switches,
489+
task_impl: Box::new(chicken_switches_loader),
490+
opctx: opctx.child(BTreeMap::new()),
491+
watchers: vec![],
492+
activator: task_chicken_switches_loader,
493+
});
494+
479495
// Background task: blueprint planner
480496
//
481497
// Replans on inventory collection and changes to the current
482498
// target blueprint.
483499
let blueprint_planner = blueprint_planner::BlueprintPlanner::new(
484500
datastore.clone(),
485-
config.blueprints.disable_planner,
501+
chicken_switches_watcher.clone(),
486502
inventory_watcher.clone(),
487503
rx_blueprint.clone(),
488504
);
@@ -496,6 +512,7 @@ impl BackgroundTasksInitializer {
496512
watchers: vec![
497513
Box::new(inventory_watcher.clone()),
498514
Box::new(rx_blueprint.clone()),
515+
Box::new(chicken_switches_watcher),
499516
],
500517
activator: task_blueprint_planner,
501518
});

nexus/src/app/background/tasks/blueprint_planner.rs

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use nexus_db_queries::context::OpContext;
1212
use nexus_db_queries::db::DataStore;
1313
use nexus_reconfigurator_planning::planner::Planner;
1414
use nexus_reconfigurator_preparation::PlanningInputFromDb;
15+
use nexus_types::deployment::ReconfiguratorChickenSwitches;
1516
use nexus_types::deployment::{Blueprint, BlueprintTarget};
1617
use nexus_types::internal_api::background::BlueprintPlannerStatus;
1718
use omicron_common::api::external::LookupType;
@@ -24,7 +25,7 @@ use tokio::sync::watch::{self, Receiver, Sender};
2425
/// Background task that runs the update planner.
2526
pub struct BlueprintPlanner {
2627
datastore: Arc<DataStore>,
27-
disabled: bool,
28+
rx_chicken_switches: Receiver<ReconfiguratorChickenSwitches>,
2829
rx_inventory: Receiver<Option<CollectionUuid>>,
2930
rx_blueprint: Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>>,
3031
tx_blueprint: Sender<Option<Arc<(BlueprintTarget, Blueprint)>>>,
@@ -33,12 +34,18 @@ pub struct BlueprintPlanner {
3334
impl BlueprintPlanner {
3435
pub fn new(
3536
datastore: Arc<DataStore>,
36-
disabled: bool,
37+
rx_chicken_switches: Receiver<ReconfiguratorChickenSwitches>,
3738
rx_inventory: Receiver<Option<CollectionUuid>>,
3839
rx_blueprint: Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>>,
3940
) -> Self {
4041
let (tx_blueprint, _) = watch::channel(None);
41-
Self { datastore, disabled, rx_inventory, rx_blueprint, tx_blueprint }
42+
Self {
43+
datastore,
44+
rx_chicken_switches,
45+
rx_inventory,
46+
rx_blueprint,
47+
tx_blueprint,
48+
}
4249
}
4350

4451
pub fn watcher(
@@ -51,7 +58,8 @@ impl BlueprintPlanner {
5158
/// If it is different from the current target blueprint,
5259
/// save it and make it the current target.
5360
pub async fn plan(&mut self, opctx: &OpContext) -> BlueprintPlannerStatus {
54-
if self.disabled {
61+
let switches = self.rx_chicken_switches.borrow_and_update().clone();
62+
if !switches.planner_enabled {
5563
debug!(&opctx.log, "blueprint planning disabled, doing nothing");
5664
return BlueprintPlannerStatus::Disabled;
5765
}
@@ -251,6 +259,7 @@ mod test {
251259
use super::*;
252260
use crate::app::background::tasks::blueprint_load::TargetBlueprintLoader;
253261
use crate::app::background::tasks::inventory_collection::InventoryCollector;
262+
use nexus_inventory::now_db_precision;
254263
use nexus_test_utils_macros::nexus_test;
255264

256265
type ControlPlaneTestContext =
@@ -291,10 +300,18 @@ mod test {
291300
let rx_collector = collector.watcher();
292301
collector.activate(&opctx).await;
293302

303+
// Enable the planner
304+
let (_tx, chicken_switches_collector_rx) =
305+
watch::channel(ReconfiguratorChickenSwitches {
306+
version: 1,
307+
planner_enabled: true,
308+
time_modified: now_db_precision(),
309+
});
310+
294311
// Finally, spin up the planner background task.
295312
let mut planner = BlueprintPlanner::new(
296313
datastore.clone(),
297-
false,
314+
chicken_switches_collector_rx,
298315
rx_collector,
299316
rx_loader.clone(),
300317
);

0 commit comments

Comments
 (0)