Auto merge of #637 - Mark-Simulacrum:health, r=Mark-Simulacrum

bors · bors · commit 02eafe8e6574 · 2022-06-05T01:11:48.000Z
Health checks for crater agents

Our heartbeats are a push API and work fine, but we also want an API that is usable by managed instance groups on GCP (and their equivalents elsewhere) to measure more precisely whether crater is up. Currently that API is a listening TCP socket that just accepts connections and terminates them immediately.
diff --git a/src/agent/mod.rs b/src/agent/mod.rs
@@ -15,8 +15,9 @@ use rustwide::Workspace;
 use std::collections::BTreeSet;
 use std::iter::FromIterator;
 use std::ops;
+use std::sync::atomic::{AtomicBool, Ordering};
 use std::thread;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 
 // Purge all the caches if the disk is more than 50% full.
 const PURGE_CACHES_THRESHOLD: f32 = 0.5;
@@ -95,6 +96,42 @@ impl Agent {
     }
 }
 
+static HEALTH_CHECK: AtomicBool = AtomicBool::new(false);
+
+// Should be called at least once every 15 minutes (the window checked by
+// `health_thread`), otherwise instance is replaced.
+pub fn set_healthy() {
+    HEALTH_CHECK.store(true, Ordering::SeqCst);
+}
+
+fn health_thread() {
+    std::thread::spawn(move || {
+        let mut last_check = Instant::now();
+
+        let listener = std::net::TcpListener::bind("0.0.0.0:4343").unwrap();
+        loop {
+            // Accept a connection...
+            drop(listener.accept());
+
+            // Then check whether we should still be healthy. If not, we simply
+            // drop the listening socket by breaking out of the loop, meaning
+            // that we'll stop responding as healthy to future connects.
+            //
+            // A build has a maximum timeout of 15 minutes in rustwide, so we
+            // currently expect checkpoints at least that often. It likely makes
+            // sense for us to be more eager, but ultimately crater runtimes are
+            // long enough that 15 minutes on one builder hopefully won't matter
+            // too much.
+            if last_check.elapsed() > Duration::from_secs(15 * 60) {
+                last_check = Instant::now();
+                if !HEALTH_CHECK.swap(false, Ordering::SeqCst) {
+                    break;
+                }
+            }
+        }
+    });
+}
+
 fn run_heartbeat(url: &str, token: &str) {
     let api = AgentApi::new(url, token);
 
@@ -149,6 +186,7 @@ pub fn run(
     let db = results::ResultsUploader::new(&agent.api);
 
     run_heartbeat(url, token);
+    health_thread();
 
     let mut past_experiment = None;
     loop {
diff --git a/src/experiments.rs b/src/experiments.rs
@@ -500,7 +500,7 @@ impl Experiment {
         }
     }
 
-    pub fn handle_failure(&mut self, db: &Database, agent: &Assignee) -> Fallible<()> {
+    pub fn clear_agent_progress(&mut self, db: &Database, agent: &str) -> Fallible<()> {
         // Mark all the running crates from this agent as queued (so that they
         // run again)
         db.execute(
@@ -514,7 +514,7 @@ impl Experiment {
                 &Status::Queued.to_string(),
                 &self.name,
                 &Status::Running.to_string(),
-                &agent.to_string(),
+                &Assignee::Agent(agent.to_string()).to_string(),
             ],
         )?;
         Ok(())
@@ -1096,7 +1096,7 @@ mod tests {
             .get_uncompleted_crates(&db, &config, &agent1)
             .unwrap()
             .is_empty());
-        ex.handle_failure(&db, &agent1).unwrap();
+        ex.clear_agent_progress(&db, "agent-1").unwrap();
         assert!(Experiment::next(&db, &agent1).unwrap().is_some());
         assert_eq!(ex.status, Status::Running);
         assert!(!ex
diff --git a/src/runner/mod.rs b/src/runner/mod.rs
@@ -78,6 +78,8 @@ pub fn run_ex<DB: WriteResults + Sync>(
         i += 1;
     }
 
+    crate::agent::set_healthy();
+
     info!("uninstalling toolchains...");
     // Clean out all the toolchains currently installed. This minimizes the
     // amount of disk space used by the base system, letting the task execution
diff --git a/src/runner/worker.rs b/src/runner/worker.rs
@@ -55,6 +55,10 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
 
     fn run_task(&self, task: &Task) -> Result<(), (failure::Error, TestResult)> {
         info!("running task: {:?}", task);
+
+        // If we're running a task, we call ourselves healthy.
+        crate::agent::set_healthy();
+
         let res = task.run(
             self.config,
             self.workspace,
diff --git a/src/server/routes/agent.rs b/src/server/routes/agent.rs
@@ -323,7 +323,7 @@ fn endpoint_error(
         .ok_or_else(|| err_msg("no experiment run by this agent"))?;
 
     data.metrics.record_error(&auth.name, &ex.name);
-    ex.handle_failure(&data.db, &Assignee::Agent(auth.name))?;
+    ex.clear_agent_progress(&data.db, &auth.name)?;
 
     Ok(ApiResponse::Success { result: true }.into_response()?)
 }

Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,8 @@ pub fn run_ex<DB: WriteResults + Sync>(`
`78`	`78`	`i += 1;`
`79`	`79`	`}`
`80`	`80`
	`81`	`+ crate::agent::set_healthy();`
	`82`	`+`
`81`	`83`	`info!("uninstalling toolchains...");`
`82`	`84`	`// Clean out all the toolchains currently installed. This minimizes the`
`83`	`85`	`// amount of disk space used by the base system, letting the task execution`
Original file line number	Diff line number	Diff line change
`@@ -323,7 +323,7 @@ fn endpoint_error(`
`323`	`323`	`.ok_or_else(\|\| err_msg("no experiment run by this agent"))?;`
`324`	`324`
`325`	`325`	`data.metrics.record_error(&auth.name, &ex.name);`
`326`		`- ex.handle_failure(&data.db, &Assignee::Agent(auth.name))?;`
	`326`	`+ ex.clear_agent_progress(&data.db, &auth.name)?;`
`327`	`327`
`328`	`328`	`Ok(ApiResponse::Success { result: true }.into_response()?)`
`329`	`329`	`}`