Skip to content

Commit 6e57f21

Browse files
Provide health check API on crater agents
This is intended to let the cloud runner (e.g., a managed instance group) replace unhealthy instances. That replacement logic isn't being added yet -- we first want to make sure agents report as healthy.
1 parent 4ffb142 commit 6e57f21

File tree

3 files changed

+45
-1
lines changed

3 files changed

+45
-1
lines changed

src/agent/mod.rs

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ use rustwide::Workspace;
1515
use std::collections::BTreeSet;
1616
use std::iter::FromIterator;
1717
use std::ops;
18+
use std::sync::atomic::{AtomicBool, Ordering};
1819
use std::thread;
19-
use std::time::Duration;
20+
use std::time::{Duration, Instant};
2021

2122
// Purge all the caches if the disk is more than 50% full.
2223
const PURGE_CACHES_THRESHOLD: f32 = 0.5;
@@ -95,6 +96,42 @@ impl Agent {
9596
}
9697
}
9798

99+
/// Liveness flag shared between the code doing work and the health check
/// thread.
///
/// `set_healthy` stores `true` here; `health_thread` swaps it back to `false`
/// each time it evaluates whether the agent is still healthy.
static HEALTH_CHECK: AtomicBool = AtomicBool::new(false);

/// Records that the agent is still making progress.
///
/// Should be called at least once every 15 minutes, which is the window
/// `health_thread` waits between evaluations of the flag. If no call happens
/// within a window, the health check listener is shut down and the instance
/// is expected to be considered unhealthy and replaced.
pub fn set_healthy() {
    HEALTH_CHECK.store(true, Ordering::SeqCst);
}
106+
107+
fn health_thread() {
108+
std::thread::spawn(move || {
109+
let mut last_check = Instant::now();
110+
111+
let listener = std::net::TcpListener::bind("0.0.0.0:4343").unwrap();
112+
loop {
113+
// Accept a connection...
114+
drop(listener.accept());
115+
116+
// Then check whether we should still be healthy. If not, we simply
117+
// drop the listening socket by breaking out of the loop, meaning
118+
// that we'll stop responding as healthy to future connects.
119+
//
120+
// A build has a maximum timeout of 15 minutes in rustwide, so we
121+
// currently expect checkpoints at least that often. It likely makes
122+
// sense for us to be more eager, but ultimately crater runtimes are
123+
// long enough that 15 minutes on one builder hopefully won't matter
124+
// too much.
125+
if last_check.elapsed() > Duration::from_secs(15 * 60) {
126+
last_check = Instant::now();
127+
if !HEALTH_CHECK.swap(false, Ordering::SeqCst) {
128+
break;
129+
}
130+
}
131+
}
132+
});
133+
}
134+
98135
fn run_heartbeat(url: &str, token: &str) {
99136
let api = AgentApi::new(url, token);
100137

@@ -149,6 +186,7 @@ pub fn run(
149186
let db = results::ResultsUploader::new(&agent.api);
150187

151188
run_heartbeat(url, token);
189+
health_thread();
152190

153191
let mut past_experiment = None;
154192
loop {

src/runner/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ pub fn run_ex<DB: WriteResults + Sync>(
7878
i += 1;
7979
}
8080

81+
crate::agent::set_healthy();
82+
8183
info!("uninstalling toolchains...");
8284
// Clean out all the toolchains currently installed. This minimizes the
8385
// amount of disk space used by the base system, letting the task execution

src/runner/worker.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
5555

5656
fn run_task(&self, task: &Task) -> Result<(), (failure::Error, TestResult)> {
5757
info!("running task: {:?}", task);
58+
59+
// If we're running a task, we call ourselves healthy.
60+
crate::agent::set_healthy();
61+
5862
let res = task.run(
5963
self.config,
6064
self.workspace,

0 commit comments

Comments
 (0)