Skip to content

Commit 2e0a518

Browse files
committed
Auto merge of #602 - Mark-Simulacrum:recover-from-error, r=pietroalbini
Recover gracefully from agent failure Previously, any agent reporting an error would cause an experiment to come to a hard stop. This is noisy (leads to a user-visible report) and, at least lately, not actually representative of a terminal failure. Instead, we take the approach of re-queueing any crates assigned to that agent and logging an error on the server host specifying which agent failed. As the error message is now not typically posted publicly (and no longer needs to be short for being in a GitHub comment), we will also have more opportunity to add extra metadata over time to that message (in future PRs) enabling diagnosing and fixing these errors. In the future, we will likely also want some tracking/reporting of these failures such that hard errors across all agents lead to alerting the infra team, but this may be best done through Prometheus metrics rather than GitHub "end-user" visible comments (which are likely not the best way to reach infra team members regardless). r? `@pietroalbini`
2 parents 599c5d7 + d0b7ab9 commit 2e0a518

File tree

6 files changed

+44
-44
lines changed

6 files changed

+44
-44
lines changed

src/experiments.rs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -485,8 +485,9 @@ impl Experiment {
485485
}
486486
}
487487

488-
pub fn report_failure(&mut self, db: &Database, agent: &Assignee) -> Fallible<()> {
489-
// Mark all the running crates from this agent as failed as well if the experiment failed
488+
pub fn handle_failure(&mut self, db: &Database, agent: &Assignee) -> Fallible<()> {
489+
// Mark all the running crates from this agent as queued (so that they
490+
// run again)
490491
db.execute(
491492
"
492493
UPDATE experiment_crates
@@ -495,14 +496,13 @@ impl Experiment {
495496
AND assigned_to = ?4
496497
",
497498
&[
498-
&Status::Failed.to_string(),
499+
&Status::Queued.to_string(),
499500
&self.name,
500501
&Status::Running.to_string(),
501502
&agent.to_string(),
502503
],
503504
)?;
504-
505-
self.set_status(db, Status::Failed)
505+
Ok(())
506506
}
507507

508508
pub fn set_status(&mut self, db: &Database, status: Status) -> Fallible<()> {
@@ -1063,6 +1063,8 @@ mod tests {
10631063
assert_eq!(uncompleted_crates.len(), 0);
10641064
}
10651065

1066+
// A failure is handled by re-queueing any running crates for a given agent,
1067+
// to be picked up by the next agent to ask for them.
10661068
#[test]
10671069
fn test_failed_experiment() {
10681070
let db = Database::temp().unwrap();
@@ -1079,9 +1081,12 @@ mod tests {
10791081
.get_uncompleted_crates(&db, &config, &agent1)
10801082
.unwrap()
10811083
.is_empty());
1082-
ex.report_failure(&db, &agent1).unwrap();
1083-
assert!(!Experiment::next(&db, &agent1).unwrap().is_some());
1084-
assert_eq!(ex.status, Status::Failed);
1085-
assert!(ex.get_running_crates(&db, &agent1).unwrap().is_empty());
1084+
ex.handle_failure(&db, &agent1).unwrap();
1085+
assert!(Experiment::next(&db, &agent1).unwrap().is_some());
1086+
assert_eq!(ex.status, Status::Running);
1087+
assert!(!ex
1088+
.get_uncompleted_crates(&db, &config, &agent1)
1089+
.unwrap()
1090+
.is_empty());
10861091
}
10871092
}

src/report/analyzer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ fn analyze_detailed(toolchain: usize, crates: Vec<CrateResult>) -> ReportCrates
4040
let mut root = Vec::new();
4141
for krate in crates {
4242
if let BuildFail(FailureReason::DependsOn(ref deps)) =
43-
(&krate.runs[toolchain]).as_ref().unwrap().res
43+
krate.runs[toolchain].as_ref().unwrap().res
4444
{
4545
for dep in deps {
4646
tree.entry(dep.clone())

src/report/markdown.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ struct ResultsContext<'a> {
3131
}
3232

3333
fn write_crate(
34-
mut rendered: &mut String,
34+
rendered: &mut String,
3535
krate: &CrateResult,
3636
comparison: Comparison,
3737
is_child: bool,
@@ -76,7 +76,7 @@ fn write_crate(
7676
};
7777

7878
writeln!(
79-
&mut rendered,
79+
rendered,
8080
"{}[{}{}]({}) {} {} **{}** [start]({}/log.txt) | [end]({}/log.txt)",
8181
prefix,
8282
krate.name,
@@ -90,7 +90,7 @@ fn write_crate(
9090
)?;
9191
} else {
9292
writeln!(
93-
&mut rendered,
93+
rendered,
9494
"{}[{}{}]({}) {} [start]({}/log.txt) | [end]({}/log.txt)",
9595
prefix, krate.name, status_warning, krate.url, comparison, runs[1], runs[3]
9696
)?;
@@ -103,10 +103,10 @@ fn render_markdown(context: &ResultsContext) -> Fallible<String> {
103103
let mut rendered = String::new();
104104

105105
//add title
106-
writeln!(&mut rendered, "# Crater report for {}\n\n", context.ex.name)?;
106+
writeln!(rendered, "# Crater report for {}\n\n", context.ex.name)?;
107107

108108
for (comparison, results) in context.categories.iter() {
109-
writeln!(&mut rendered, "\n### {}", comparison)?;
109+
writeln!(rendered, "\n### {}", comparison)?;
110110
match results {
111111
ReportCratesMD::Plain(crates) => {
112112
for krate in crates {
@@ -123,7 +123,7 @@ fn render_markdown(context: &ResultsContext) -> Fallible<String> {
123123

124124
for (krate, deps) in orphans {
125125
writeln!(
126-
&mut rendered,
126+
rendered,
127127
"* [{}]({}) (not covered in crater testing)",
128128
krate,
129129
crate_to_url(krate)

src/server/metrics.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@ use prometheus::{
1111

1212
const JOBS_METRIC: &str = "crater_completed_jobs_total";
1313
const AGENT_WORK_METRIC: &str = "crater_agent_supposed_to_work";
14+
const AGENT_FAILED: &str = "crater_agent_failure";
1415
const LAST_CRATES_UPDATE_METRIC: &str = "crater_last_crates_update";
1516

1617
#[derive(Clone)]
1718
pub struct Metrics {
1819
crater_completed_jobs_total: IntCounterVec,
20+
crater_agent_failure: IntCounterVec,
1921
crater_work_status: IntGaugeVec,
2022
crater_last_crates_update: IntGauge,
2123
}
@@ -25,6 +27,9 @@ impl Metrics {
2527
let jobs_opts = prometheus::opts!(JOBS_METRIC, "total completed jobs");
2628
let crater_completed_jobs_total =
2729
prometheus::register_int_counter_vec!(jobs_opts, &["agent", "experiment"])?;
30+
let failure_opts = prometheus::opts!(AGENT_FAILED, "total completed jobs");
31+
let crater_agent_failure =
32+
prometheus::register_int_counter_vec!(failure_opts, &["agent", "experiment"])?;
2833
let agent_opts = prometheus::opts!(AGENT_WORK_METRIC, "is agent supposed to work");
2934
let crater_work_status = prometheus::register_int_gauge_vec!(agent_opts, &["agent"])?;
3035
let crates_update_opts =
@@ -33,11 +38,18 @@ impl Metrics {
3338

3439
Ok(Metrics {
3540
crater_completed_jobs_total,
41+
crater_agent_failure,
3642
crater_work_status,
3743
crater_last_crates_update,
3844
})
3945
}
4046

47+
pub fn record_error(&self, agent: &str, experiment: &str) {
48+
self.crater_agent_failure
49+
.with_label_values(&[agent, experiment])
50+
.inc_by(1);
51+
}
52+
4153
pub fn record_completed_jobs(&self, agent: &str, experiment: &str, amount: i64) {
4254
self.crater_completed_jobs_total
4355
.with_label_values(&[agent, experiment])

src/server/routes/agent.rs

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pub fn routes(
5151
.and(warp::path("next-experiment"))
5252
.and(warp::path::end())
5353
.and(mutex_filter.clone())
54-
.and(github_data_filter.clone())
54+
.and(github_data_filter)
5555
.and(auth_filter(data.clone(), TokenType::Agent))
5656
.map(endpoint_next_experiment);
5757

@@ -75,7 +75,6 @@ pub fn routes(
7575
.and(warp::path::end())
7676
.and(warp::body::json())
7777
.and(mutex_filter)
78-
.and(github_data_filter)
7978
.and(auth_filter(data, TokenType::Agent))
8079
.map(endpoint_error);
8180

@@ -197,38 +196,22 @@ fn endpoint_heartbeat(data: Arc<Data>, auth: AuthDetails) -> Fallible<Response<B
197196
fn endpoint_error(
198197
error: ExperimentData<HashMap<String, String>>,
199198
mutex: Arc<Mutex<Data>>,
200-
github_data: Option<Arc<GithubData>>,
201199
auth: AuthDetails,
202200
) -> Fallible<Response<Body>> {
201+
log::error!(
202+
"agent {} failed while running {}: {:?}",
203+
auth.name,
204+
error.experiment_name,
205+
error.data.get("error")
206+
);
207+
203208
let data = mutex.lock().unwrap();
204209
let mut ex = Experiment::get(&data.db, &error.experiment_name)?
205210
.ok_or_else(|| err_msg("no experiment run by this agent"))?;
206211

207-
//also set status to failed
208-
ex.report_failure(&data.db, &Assignee::Agent(auth.name))?;
212+
data.metrics.record_error(&auth.name, &ex.name);
213+
ex.handle_failure(&data.db, &Assignee::Agent(auth.name))?;
209214

210-
if let Some(github_data) = github_data.as_ref() {
211-
if let Some(ref github_issue) = ex.github_issue {
212-
Message::new()
213-
.line(
214-
"rotating_light",
215-
format!(
216-
"Experiment **`{}`** has encountered an error: {}",
217-
ex.name,
218-
error.data.get("error").unwrap_or(&String::from("no error")),
219-
),
220-
)
221-
.line(
222-
"hammer_and_wrench",
223-
"If the error is fixed use the `retry` command.",
224-
)
225-
.note(
226-
"sos",
227-
"Can someone from the infra team check in on this? @rust-lang/infra",
228-
)
229-
.send(&github_issue.api_url, &data, github_data)?;
230-
}
231-
}
232215
Ok(ApiResponse::Success { result: true }.into_response()?)
233216
}
234217

src/toolchain.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ impl FromStr for Toolchain {
118118
for part in parts {
119119
if let Some(equal_idx) = part.find('=') {
120120
let (flag, value_with_equal) = part.split_at(equal_idx);
121-
let value = (&value_with_equal[1..]).to_string();
121+
let value = value_with_equal[1..].to_string();
122122

123123
if value.is_empty() {
124124
return Err(ToolchainParseError::InvalidFlag(flag.to_string()));

0 commit comments

Comments
 (0)