Skip to content

Commit 02eafe8

Browse files
committed
Auto merge of #637 - Mark-Simulacrum:health, r=Mark-Simulacrum
Health checks for crater agents. Our heartbeats are a push API and work fine, but we also want an API that managed instance groups on GCP (and their equivalents elsewhere) can use to measure more precisely whether crater is up. Currently that API is a listening TCP socket that simply accepts connections and terminates them immediately.
2 parents 244d878 + 6e57f21 commit 02eafe8

File tree

5 files changed

+49
-5
lines changed

5 files changed

+49
-5
lines changed

src/agent/mod.rs

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ use rustwide::Workspace;
1515
use std::collections::BTreeSet;
1616
use std::iter::FromIterator;
1717
use std::ops;
18+
use std::sync::atomic::{AtomicBool, Ordering};
1819
use std::thread;
19-
use std::time::Duration;
20+
use std::time::{Duration, Instant};
2021

2122
// Purge all the caches if the disk is more than 50% full.
2223
const PURGE_CACHES_THRESHOLD: f32 = 0.5;
@@ -95,6 +96,42 @@ impl Agent {
9596
}
9697
}
9798

99+
static HEALTH_CHECK: AtomicBool = AtomicBool::new(false);
100+
101+
// Should be called at least once every 15 minutes, otherwise instance is
102+
// replaced.
103+
pub fn set_healthy() {
104+
HEALTH_CHECK.store(true, Ordering::SeqCst);
105+
}
106+
107+
fn health_thread() {
108+
std::thread::spawn(move || {
109+
let mut last_check = Instant::now();
110+
111+
let listener = std::net::TcpListener::bind("0.0.0.0:4343").unwrap();
112+
loop {
113+
// Accept a connection...
114+
drop(listener.accept());
115+
116+
// Then check whether we should still be healthy. If not, we simply
117+
// drop the listening socket by breaking out of the loop, meaning
118+
// that we'll stop responding as healthy to future connects.
119+
//
120+
// A build has a maximum timeout of 15 minutes in rustwide, so we
121+
// currently expect checkpoints at least that often. It likely makes
122+
// sense for us to be more eager, but ultimately crater runtimes are
123+
// long enough that 15 minutes on one builder hopefully won't matter
124+
// too much.
125+
if last_check.elapsed() > Duration::from_secs(15 * 60) {
126+
last_check = Instant::now();
127+
if !HEALTH_CHECK.swap(false, Ordering::SeqCst) {
128+
break;
129+
}
130+
}
131+
}
132+
});
133+
}
134+
98135
fn run_heartbeat(url: &str, token: &str) {
99136
let api = AgentApi::new(url, token);
100137

@@ -149,6 +186,7 @@ pub fn run(
149186
let db = results::ResultsUploader::new(&agent.api);
150187

151188
run_heartbeat(url, token);
189+
health_thread();
152190

153191
let mut past_experiment = None;
154192
loop {

src/experiments.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ impl Experiment {
500500
}
501501
}
502502

503-
pub fn handle_failure(&mut self, db: &Database, agent: &Assignee) -> Fallible<()> {
503+
pub fn clear_agent_progress(&mut self, db: &Database, agent: &str) -> Fallible<()> {
504504
// Mark all the running crates from this agent as queued (so that they
505505
// run again)
506506
db.execute(
@@ -514,7 +514,7 @@ impl Experiment {
514514
&Status::Queued.to_string(),
515515
&self.name,
516516
&Status::Running.to_string(),
517-
&agent.to_string(),
517+
&Assignee::Agent(agent.to_string()).to_string(),
518518
],
519519
)?;
520520
Ok(())
@@ -1096,7 +1096,7 @@ mod tests {
10961096
.get_uncompleted_crates(&db, &config, &agent1)
10971097
.unwrap()
10981098
.is_empty());
1099-
ex.handle_failure(&db, &agent1).unwrap();
1099+
ex.clear_agent_progress(&db, "agent-1").unwrap();
11001100
assert!(Experiment::next(&db, &agent1).unwrap().is_some());
11011101
assert_eq!(ex.status, Status::Running);
11021102
assert!(!ex

src/runner/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ pub fn run_ex<DB: WriteResults + Sync>(
7878
i += 1;
7979
}
8080

81+
crate::agent::set_healthy();
82+
8183
info!("uninstalling toolchains...");
8284
// Clean out all the toolchains currently installed. This minimizes the
8385
// amount of disk space used by the base system, letting the task execution

src/runner/worker.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
5555

5656
fn run_task(&self, task: &Task) -> Result<(), (failure::Error, TestResult)> {
5757
info!("running task: {:?}", task);
58+
59+
// If we're running a task, we call ourselves healthy.
60+
crate::agent::set_healthy();
61+
5862
let res = task.run(
5963
self.config,
6064
self.workspace,

src/server/routes/agent.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ fn endpoint_error(
323323
.ok_or_else(|| err_msg("no experiment run by this agent"))?;
324324

325325
data.metrics.record_error(&auth.name, &ex.name);
326-
ex.handle_failure(&data.db, &Assignee::Agent(auth.name))?;
326+
ex.clear_agent_progress(&data.db, &auth.name)?;
327327

328328
Ok(ApiResponse::Success { result: true }.into_response()?)
329329
}

0 commit comments

Comments
 (0)