Provide health check API on crater agents

Mark-Simulacrum · Mark-Simulacrum · commit 6e57f211135b · 2022-06-04T20:56:06.000-04:00
This is intended to let the cloud runner (e.g., a managed instance group) replace
unhealthy instances. That replacement logic isn't being added yet -- we want to
make sure the agents report as healthy first.
diff --git a/src/agent/mod.rs b/src/agent/mod.rs
@@ -15,8 +15,9 @@ use rustwide::Workspace;
 use std::collections::BTreeSet;
 use std::iter::FromIterator;
 use std::ops;
+use std::sync::atomic::{AtomicBool, Ordering};
 use std::thread;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 
 // Purge all the caches if the disk is more than 50% full.
 const PURGE_CACHES_THRESHOLD: f32 = 0.5;
@@ -95,6 +96,42 @@ impl Agent {
     }
 }
 
+static HEALTH_CHECK: AtomicBool = AtomicBool::new(false);
+
+// Should be called at least once every 15 minutes (the window checked in
+// `health_thread`); otherwise the instance is deemed unhealthy and replaced.
+pub fn set_healthy() {
+    HEALTH_CHECK.store(true, Ordering::SeqCst);
+}
+
+fn health_thread() {
+    std::thread::spawn(move || {
+        let mut last_check = Instant::now();
+
+        let listener = std::net::TcpListener::bind("0.0.0.0:4343").unwrap();
+        loop {
+            // Accept a connection...
+            drop(listener.accept());
+
+            // Then check whether we should still be healthy. If not, we simply
+            // drop the listening socket by breaking out of the loop, meaning
+            // that we'll stop responding as healthy to future connects.
+            //
+            // A build has a maximum timeout of 15 minutes in rustwide, so we
+            // currently expect checkpoints at least that often. It likely makes
+            // sense for us to be more eager, but ultimately crater runtimes are
+            // long enough that 15 minutes on one builder hopefully won't matter
+            // too much.
+            if last_check.elapsed() > Duration::from_secs(15 * 60) {
+                last_check = Instant::now();
+                if !HEALTH_CHECK.swap(false, Ordering::SeqCst) {
+                    break;
+                }
+            }
+        }
+    });
+}
+
 fn run_heartbeat(url: &str, token: &str) {
     let api = AgentApi::new(url, token);
 
@@ -149,6 +186,7 @@ pub fn run(
     let db = results::ResultsUploader::new(&agent.api);
 
     run_heartbeat(url, token);
+    health_thread();
 
     let mut past_experiment = None;
     loop {
diff --git a/src/runner/mod.rs b/src/runner/mod.rs
@@ -78,6 +78,8 @@ pub fn run_ex<DB: WriteResults + Sync>(
         i += 1;
     }
 
+    crate::agent::set_healthy();
+
     info!("uninstalling toolchains...");
     // Clean out all the toolchains currently installed. This minimizes the
     // amount of disk space used by the base system, letting the task execution
diff --git a/src/runner/worker.rs b/src/runner/worker.rs
@@ -55,6 +55,10 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
 
     fn run_task(&self, task: &Task) -> Result<(), (failure::Error, TestResult)> {
         info!("running task: {:?}", task);
+
+        // If we're running a task, we call ourselves healthy.
+        crate::agent::set_healthy();
+
         let res = task.run(
             self.config,
             self.workspace,

Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,8 @@ pub fn run_ex<DB: WriteResults + Sync>(`
`78`	`78`	`i += 1;`
`79`	`79`	`}`
`80`	`80`
	`81`	`+ crate::agent::set_healthy();`
	`82`	`+`
`81`	`83`	`info!("uninstalling toolchains...");`
`82`	`84`	`// Clean out all the toolchains currently installed. This minimizes the`
`83`	`85`	`// amount of disk space used by the base system, letting the task execution`