Skip to content

Commit ae1099a

Browse files
Recover gracefully from agent failure
Previously, any agent reporting an error would cause an experiment to come to a hard stop. This is noisy (leads to a user-visible report) and, at least lately, not actually representative of a terminal failure. Instead, we take the approach of re-queueing any crates assigned to that agent and logging an error on the server host specifying which agent failed. As the error message is now not typically posted publicly (and no longer needs to be short for being in a GitHub comment), we will also have more opportunity to add extra metadata over time to that message (in future PRs) enabling diagnosing and fixing these errors. In the future, we will likely also want some tracking/reporting of these failures such that hard errors across all agents lead to alerting the infra team, but this may be best done through Prometheus metrics rather than GitHub "end-user" visible comments (which are likely not the best way to reach infra team members regardless).
1 parent 599c5d7 commit ae1099a

File tree

2 files changed

+22
-34
lines changed

2 files changed

+22
-34
lines changed

src/experiments.rs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -485,8 +485,9 @@ impl Experiment {
485485
}
486486
}
487487

488-
pub fn report_failure(&mut self, db: &Database, agent: &Assignee) -> Fallible<()> {
489-
// Mark all the running crates from this agent as failed as well if the experiment failed
488+
pub fn handle_failure(&mut self, db: &Database, agent: &Assignee) -> Fallible<()> {
489+
// Mark all the running crates from this agent as queued (so that they
490+
// run again)
490491
db.execute(
491492
"
492493
UPDATE experiment_crates
@@ -495,14 +496,13 @@ impl Experiment {
495496
AND assigned_to = ?4
496497
",
497498
&[
498-
&Status::Failed.to_string(),
499+
&Status::Queued.to_string(),
499500
&self.name,
500501
&Status::Running.to_string(),
501502
&agent.to_string(),
502503
],
503504
)?;
504-
505-
self.set_status(db, Status::Failed)
505+
Ok(())
506506
}
507507

508508
pub fn set_status(&mut self, db: &Database, status: Status) -> Fallible<()> {
@@ -1063,6 +1063,8 @@ mod tests {
10631063
assert_eq!(uncompleted_crates.len(), 0);
10641064
}
10651065

1066+
// A failure is handled by re-queueing any running crates for a given agent,
1067+
// to be picked up by the next agent to ask for them.
10661068
#[test]
10671069
fn test_failed_experiment() {
10681070
let db = Database::temp().unwrap();
@@ -1079,9 +1081,12 @@ mod tests {
10791081
.get_uncompleted_crates(&db, &config, &agent1)
10801082
.unwrap()
10811083
.is_empty());
1082-
ex.report_failure(&db, &agent1).unwrap();
1083-
assert!(!Experiment::next(&db, &agent1).unwrap().is_some());
1084-
assert_eq!(ex.status, Status::Failed);
1085-
assert!(ex.get_running_crates(&db, &agent1).unwrap().is_empty());
1084+
ex.handle_failure(&db, &agent1).unwrap();
1085+
assert!(Experiment::next(&db, &agent1).unwrap().is_some());
1086+
assert_eq!(ex.status, Status::Running);
1087+
assert!(!ex
1088+
.get_uncompleted_crates(&db, &config, &agent1)
1089+
.unwrap()
1090+
.is_empty());
10861091
}
10871092
}

src/server/routes/agent.rs

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ pub fn routes(
7575
.and(warp::path::end())
7676
.and(warp::body::json())
7777
.and(mutex_filter)
78-
.and(github_data_filter)
7978
.and(auth_filter(data, TokenType::Agent))
8079
.map(endpoint_error);
8180

@@ -197,38 +196,22 @@ fn endpoint_heartbeat(data: Arc<Data>, auth: AuthDetails) -> Fallible<Response<B
197196
fn endpoint_error(
198197
error: ExperimentData<HashMap<String, String>>,
199198
mutex: Arc<Mutex<Data>>,
200-
github_data: Option<Arc<GithubData>>,
201199
auth: AuthDetails,
202200
) -> Fallible<Response<Body>> {
201+
log::error!(
202+
"agent {} failed while running {}: {:?}",
203+
auth.name,
204+
error.experiment_name,
205+
error.data.get("error")
206+
);
207+
203208
let data = mutex.lock().unwrap();
204209
let mut ex = Experiment::get(&data.db, &error.experiment_name)?
205210
.ok_or_else(|| err_msg("no experiment run by this agent"))?;
206211

207212
//also set status to failed
208-
ex.report_failure(&data.db, &Assignee::Agent(auth.name))?;
213+
ex.handle_failure(&data.db, &Assignee::Agent(auth.name))?;
209214

210-
if let Some(github_data) = github_data.as_ref() {
211-
if let Some(ref github_issue) = ex.github_issue {
212-
Message::new()
213-
.line(
214-
"rotating_light",
215-
format!(
216-
"Experiment **`{}`** has encountered an error: {}",
217-
ex.name,
218-
error.data.get("error").unwrap_or(&String::from("no error")),
219-
),
220-
)
221-
.line(
222-
"hammer_and_wrench",
223-
"If the error is fixed use the `retry` command.",
224-
)
225-
.note(
226-
"sos",
227-
"Can someone from the infra team check in on this? @rust-lang/infra",
228-
)
229-
.send(&github_issue.api_url, &data, github_data)?;
230-
}
231-
}
232215
Ok(ApiResponse::Success { result: true }.into_response()?)
233216
}
234217

0 commit comments

Comments
 (0)