Skip to content

Commit 03856c6

Browse files
authored
Improve failure mode in omicron-dev tests that fail (#3433)
This fixes a difficult failure mode I ran into recently. Some of the tests in `omicron-dev` spawn CLI programs, such as `cockroach`. These are put into a progress group so that, on failure, the entire group can be signaled and reaped. However, these tests were previously structured so that the group is only killed on a successful test run; an unwrap or other panic early would exit before the shutdown code was run. This leaves the group running and the test hangs. This PR fixes a couple of those tests, by running the bulk of test code we care about in a separate fallible future. We run it; store the result; kill and reap the child processes; and then assert that the body ran successfully.
1 parent efc2566 commit 03856c6

File tree

1 file changed

+105
-79
lines changed

1 file changed

+105
-79
lines changed

dev-tools/tests/test_omicron_dev.rs

Lines changed: 105 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -289,66 +289,79 @@ async fn test_db_run() {
289289
let exec =
290290
Exec::cmd("bash").arg("-c").arg(cmdstr).stderr(Redirection::Merge);
291291
let dbrun = run_db_run(exec, true);
292-
let (client, connection) = dbrun
293-
.listen_config
294-
.connect(tokio_postgres::NoTls)
295-
.await
296-
.expect("failed to connect to newly setup database");
297-
let conn_task = tokio::spawn(connection);
298-
299-
assert!(has_omicron_schema(&client).await);
300-
301-
// Now run db-populate. It should fail because the database is already
302-
// populated.
303-
eprintln!("running db-populate");
304-
let populate_result = Exec::cmd(&cmd_path)
305-
.arg("db-populate")
306-
.arg("--database-url")
307-
.arg(&dbrun.listen_config_url)
308-
.stdout(Redirection::Pipe)
309-
.stderr(Redirection::Pipe)
310-
.capture()
311-
.expect("failed to run db-populate");
312-
eprintln!("exit status: {:?}", populate_result.exit_status);
313-
eprintln!("stdout: {:?}", populate_result.stdout_str());
314-
eprintln!("stdout: {:?}", populate_result.stderr_str());
315-
assert!(matches!(populate_result.exit_status, ExitStatus::Exited(1)));
316-
assert!(populate_result
317-
.stderr_str()
318-
.contains("database \"omicron\" already exists"));
319-
assert!(has_omicron_schema(&client).await);
320-
321-
// Try again, but with the --wipe flag.
322-
eprintln!("running db-populate --wipe");
323-
let populate_result = Exec::cmd(&cmd_path)
324-
.arg("db-populate")
325-
.arg("--wipe")
326-
.arg("--database-url")
327-
.arg(&dbrun.listen_config_url)
328-
.capture()
329-
.expect("failed to run db-populate");
330-
assert!(matches!(populate_result.exit_status, ExitStatus::Exited(0)));
331-
assert!(has_omicron_schema(&client).await);
332-
333-
// Now run db-wipe. This should work.
334-
eprintln!("running db-wipe");
335-
let wipe_result = Exec::cmd(&cmd_path)
336-
.arg("db-wipe")
337-
.arg("--database-url")
338-
.arg(&dbrun.listen_config_url)
339-
.capture()
340-
.expect("failed to run db-wipe");
341-
assert!(matches!(wipe_result.exit_status, ExitStatus::Exited(0)));
342-
assert!(!has_omicron_schema(&client).await);
343-
344-
// The rest of the populate()/wipe() behavior is tested elsewhere.
345-
346-
drop(client);
347-
conn_task
348-
.await
349-
.expect("failed to join on connection")
350-
.expect("connection failed with an error");
351-
eprintln!("cleaned up connection");
292+
let test_task = async {
293+
let (client, connection) = dbrun
294+
.listen_config
295+
.connect(tokio_postgres::NoTls)
296+
.await
297+
.context("failed to connect to newly setup database")?;
298+
let conn_task = tokio::spawn(connection);
299+
300+
anyhow::ensure!(has_omicron_schema(&client).await);
301+
302+
// Now run db-populate. It should fail because the database is already
303+
// populated.
304+
eprintln!("running db-populate");
305+
let populate_result = Exec::cmd(&cmd_path)
306+
.arg("db-populate")
307+
.arg("--database-url")
308+
.arg(&dbrun.listen_config_url)
309+
.stdout(Redirection::Pipe)
310+
.stderr(Redirection::Pipe)
311+
.capture()
312+
.context("failed to run db-populate")?;
313+
eprintln!("exit status: {:?}", populate_result.exit_status);
314+
eprintln!("stdout: {:?}", populate_result.stdout_str());
315+
eprintln!("stdout: {:?}", populate_result.stderr_str());
316+
anyhow::ensure!(matches!(
317+
populate_result.exit_status,
318+
ExitStatus::Exited(1)
319+
));
320+
anyhow::ensure!(populate_result
321+
.stderr_str()
322+
.contains("database \"omicron\" already exists"),);
323+
anyhow::ensure!(has_omicron_schema(&client).await);
324+
325+
// Try again, but with the --wipe flag.
326+
eprintln!("running db-populate --wipe");
327+
let populate_result = Exec::cmd(&cmd_path)
328+
.arg("db-populate")
329+
.arg("--wipe")
330+
.arg("--database-url")
331+
.arg(&dbrun.listen_config_url)
332+
.capture()
333+
.context("failed to run db-populate")?;
334+
anyhow::ensure!(matches!(
335+
populate_result.exit_status,
336+
ExitStatus::Exited(0)
337+
));
338+
anyhow::ensure!(has_omicron_schema(&client).await);
339+
340+
// Now run db-wipe. This should work.
341+
eprintln!("running db-wipe");
342+
let wipe_result = Exec::cmd(&cmd_path)
343+
.arg("db-wipe")
344+
.arg("--database-url")
345+
.arg(&dbrun.listen_config_url)
346+
.capture()
347+
.context("failed to run db-wipe")?;
348+
anyhow::ensure!(matches!(
349+
wipe_result.exit_status,
350+
ExitStatus::Exited(0)
351+
));
352+
anyhow::ensure!(!has_omicron_schema(&client).await);
353+
354+
// The rest of the populate()/wipe() behavior is tested elsewhere.
355+
356+
drop(client);
357+
conn_task
358+
.await
359+
.context("failed to join on connection")?
360+
.context("connection failed with an error")?;
361+
eprintln!("cleaned up connection");
362+
Ok(())
363+
};
364+
let res = test_task.await;
352365

353366
// Figure out what process group our child processes are in. (That won't be
354367
// the child's pid because the immediate shell will be in our process group,
@@ -369,6 +382,7 @@ async fn test_db_run() {
369382
);
370383
eprintln!("wait result: {:?}", wait);
371384
assert!(matches!(wait, subprocess::ExitStatus::Exited(0)));
385+
res.expect("test task failed");
372386
}
373387

374388
// Exercises the normal use case of `omicron-dev run-all`: everything starts up,
@@ -388,25 +402,34 @@ async fn test_run_all() {
388402
Exec::cmd("bash").arg("-c").arg(cmdstr).stderr(Redirection::Merge);
389403
let runall = run_run_all(exec);
390404

391-
// Make sure we can connect to CockroachDB.
392-
let (client, connection) = runall
393-
.postgres_config
394-
.connect(tokio_postgres::NoTls)
395-
.await
396-
.expect("failed to connect to newly setup database");
397-
let conn_task = tokio::spawn(connection);
398-
assert!(has_omicron_schema(&client).await);
399-
drop(client);
400-
conn_task
401-
.await
402-
.expect("failed to join on connection")
403-
.expect("connection failed with an error");
404-
eprintln!("cleaned up connection");
405-
406-
// Make sure we can connect to Nexus.
407-
let client =
408-
oxide_client::Client::new(&format!("http://{}", runall.external_url));
409-
let _ = client.logout().send().await.unwrap();
405+
let test_task = async {
406+
// Make sure we can connect to CockroachDB.
407+
let (client, connection) = runall
408+
.postgres_config
409+
.connect(tokio_postgres::NoTls)
410+
.await
411+
.context("failed to connect to newly setup database")?;
412+
let conn_task = tokio::spawn(connection);
413+
anyhow::ensure!(has_omicron_schema(&client).await);
414+
drop(client);
415+
conn_task
416+
.await
417+
.context("failed to join on connection")?
418+
.context("connection failed with an error")?;
419+
eprintln!("cleaned up connection");
420+
421+
// Make sure we can connect to Nexus.
422+
let client = oxide_client::Client::new(&format!(
423+
"http://{}",
424+
runall.external_url
425+
));
426+
let _ =
427+
client.logout().send().await.context(
428+
"Unexpectedly failed to reach Nexus at logout endpoint",
429+
)?;
430+
Ok(())
431+
};
432+
let res = test_task.await;
410433

411434
// Figure out what process group our child processes are in. (That won't be
412435
// the child's pid because the immediate shell will be in our process group,
@@ -427,6 +450,9 @@ async fn test_run_all() {
427450
);
428451
eprintln!("wait result: {:?}", wait);
429452
assert!(matches!(wait, subprocess::ExitStatus::Exited(0)));
453+
454+
// Unwrap the caught errors we are actually trying to test.
455+
res.expect("failed to run test");
430456
}
431457

432458
// Exercises the unusual case of `omicron-dev db-run` where the database shuts

0 commit comments

Comments
 (0)