Skip to content

Commit d0b7ab9

Browse files
Add metric for agent failures
1 parent 5557ef5 commit d0b7ab9

File tree

2 files changed

+13
-1
lines changed

2 files changed

+13
-1
lines changed

src/server/metrics.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@ use prometheus::{
1111

1212
/// Prometheus name of the completed-jobs counter; registered with the help
/// text "total completed jobs" and the labels `agent` and `experiment`.
const JOBS_METRIC: &str = "crater_completed_jobs_total";
1313
/// Prometheus name of the per-agent gauge registered with the help text
/// "is agent supposed to work" and the single label `agent`.
const AGENT_WORK_METRIC: &str = "crater_agent_supposed_to_work";
14+
/// Prometheus name of the agent-failure counter; registered with the labels
/// `agent` and `experiment`.
// NOTE(review): the registration passes the help text "total completed jobs"
// for this metric, which looks copy-pasted from JOBS_METRIC — confirm and
// replace with a description of failures.
const AGENT_FAILED: &str = "crater_agent_failure";
1415
/// Prometheus name for the last-crates-update metric.
// NOTE(review): the registration that uses this name is not fully visible
// here; presumably it backs the `crater_last_crates_update` gauge — confirm.
const LAST_CRATES_UPDATE_METRIC: &str = "crater_last_crates_update";
1516

1617
/// Bundle of the Prometheus collectors exposed by the server.
///
/// Each field holds a collector created through the `prometheus::register_*`
/// macros when the struct is built; the struct derives `Clone`.
#[derive(Clone)]
1718
pub struct Metrics {
1819
/// Counter of completed jobs, keyed by the labels `agent` and `experiment`.
crater_completed_jobs_total: IntCounterVec,
20+
/// Counter of agent failures, keyed by the labels `agent` and `experiment`;
/// bumped by `record_error`.
// NOTE(review): registered with the help text "total completed jobs", which
// appears to be copied from the completed-jobs counter — confirm and fix.
crater_agent_failure: IntCounterVec,
1921
/// Gauge keyed by the label `agent`; help text "is agent supposed to work".
crater_work_status: IntGaugeVec,
2022
/// Unlabelled integer gauge.
// NOTE(review): presumably tracks when the crates list was last updated
// (going by the metric name) — its registration is not fully visible here.
crater_last_crates_update: IntGauge,
2123
}
@@ -25,6 +27,9 @@ impl Metrics {
2527
let jobs_opts = prometheus::opts!(JOBS_METRIC, "total completed jobs");
2628
let crater_completed_jobs_total =
2729
prometheus::register_int_counter_vec!(jobs_opts, &["agent", "experiment"])?;
30+
let failure_opts = prometheus::opts!(AGENT_FAILED, "total completed jobs");
31+
let crater_agent_failure =
32+
prometheus::register_int_counter_vec!(failure_opts, &["agent", "experiment"])?;
2833
let agent_opts = prometheus::opts!(AGENT_WORK_METRIC, "is agent supposed to work");
2934
let crater_work_status = prometheus::register_int_gauge_vec!(agent_opts, &["agent"])?;
3035
let crates_update_opts =
@@ -33,11 +38,18 @@ impl Metrics {
3338

3439
Ok(Metrics {
3540
crater_completed_jobs_total,
41+
crater_agent_failure,
3642
crater_work_status,
3743
crater_last_crates_update,
3844
})
3945
}
4046

47+
pub fn record_error(&self, agent: &str, experiment: &str) {
48+
self.crater_agent_failure
49+
.with_label_values(&[agent, experiment])
50+
.inc_by(1);
51+
}
52+
4153
pub fn record_completed_jobs(&self, agent: &str, experiment: &str, amount: i64) {
4254
self.crater_completed_jobs_total
4355
.with_label_values(&[agent, experiment])

src/server/routes/agent.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ fn endpoint_error(
209209
let mut ex = Experiment::get(&data.db, &error.experiment_name)?
210210
.ok_or_else(|| err_msg("no experiment run by this agent"))?;
211211

212-
//also set status to failed
212+
data.metrics.record_error(&auth.name, &ex.name);
213213
ex.handle_failure(&data.db, &Assignee::Agent(auth.name))?;
214214

215215
Ok(ApiResponse::Success { result: true }.into_response()?)

0 commit comments

Comments
 (0)