Skip to content

Commit 8491684

Browse files
committed
Auto merge of #710 - Mark-Simulacrum:track-worker-count, r=Mark-Simulacrum
In-memory tracking for active worker count
2 parents 4d35849 + 8ed29a5 commit 8491684

File tree

4 files changed

+54
-2
lines changed

4 files changed

+54
-2
lines changed

src/agent/api.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,15 @@ impl ResponseExt for ::reqwest::blocking::Response {
6868
/// Client-side handle an agent uses to issue API requests.
pub struct AgentApi {
    // Stored from the `url` argument of `new`; presumably the server's base URL — confirm.
    url: String,
    // Stored from the `token` argument of `new`; presumably the agent's auth token — confirm.
    token: String,
    // Random identifier generated once in `new`.
    random_id: String,
}
7273

7374
impl AgentApi {
7475
pub fn new(url: &str, token: &str) -> Self {
7576
AgentApi {
7677
url: url.to_string(),
7778
token: token.to_string(),
79+
random_id: format!("{:X}{:X}", rand::random::<u64>(), rand::random::<u64>()),
7880
}
7981
}
8082

@@ -200,6 +202,9 @@ impl AgentApi {
200202
self.retry(|this| {
201203
let _: bool = this
202204
.build_request(Method::POST, "heartbeat")
205+
.json(&json!({
206+
"id": self.random_id,
207+
}))
203208
.send()?
204209
.to_api_response()?;
205210
Ok(())

src/server/agents.rs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ use crate::prelude::*;
55
use crate::server::tokens::Tokens;
66
use chrono::Duration;
77
use chrono::{DateTime, Utc};
8+
use std::collections::HashMap;
89
use std::collections::HashSet;
10+
use std::sync::{Arc, Mutex};
911

1012
/// Number of seconds without a heartbeat after which an agent should be considered unreachable.
1113
const INACTIVE_AFTER: i64 = 300;
@@ -74,15 +76,41 @@ impl Agent {
7476
#[derive(Clone)]
7577
pub struct Agents {
7678
db: Database,
79+
// worker -> timestamp
80+
current_workers: Arc<Mutex<HashMap<String, (WorkerInfo, std::time::Instant)>>>,
81+
}
82+
83+
#[derive(Deserialize)]
84+
pub struct WorkerInfo {
85+
id: String,
7786
}
7887

7988
impl Agents {
8089
pub fn new(db: Database, tokens: &Tokens) -> Fallible<Self> {
81-
let agents = Agents { db };
90+
let agents = Agents {
91+
db,
92+
current_workers: Arc::new(Mutex::new(HashMap::new())),
93+
};
8294
agents.synchronize(tokens)?;
8395
Ok(agents)
8496
}
8597

98+
pub fn active_worker_count(&self) -> usize {
99+
let mut guard = self.current_workers.lock().unwrap();
100+
guard.retain(|_, (_, timestamp)| {
101+
// It's been 10 minutes since we heard from this worker, drop it from our active list.
102+
timestamp.elapsed() > std::time::Duration::from_secs(60 * 10)
103+
});
104+
guard.len()
105+
}
106+
107+
/// Records the worker described by `id` as seen right now.
///
/// The entry is keyed by the worker's id; an existing entry under the same
/// id is replaced, which refreshes its timestamp.
///
/// # Panics
///
/// Panics if the mutex guarding the worker map is poisoned.
pub fn add_worker(&self, id: WorkerInfo) {
    let mut workers = self.current_workers.lock().unwrap();
    // The key must be cloned because `id` itself is moved into the stored value.
    let key = id.id.clone();
    workers.insert(key, (id, std::time::Instant::now()));
}
113+
86114
fn synchronize(&self, tokens: &Tokens) -> Fallible<()> {
87115
self.db.transaction(|trans| {
88116
let mut real = tokens.agents.values().collect::<HashSet<&String>>();

src/server/metrics.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ const AGENT_WORK_METRIC: &str = "crater_agent_supposed_to_work";
1010
const AGENT_FAILED: &str = "crater_agent_failure";
1111
const LAST_CRATES_UPDATE_METRIC: &str = "crater_last_crates_update";
1212
const ENDPOINT_TIME: &str = "crater_endpoint_time_seconds";
13+
const WORKER_COUNT: &str = "crater_worker_count";
1314

1415
#[derive(Clone)]
1516
pub struct Metrics {
@@ -19,6 +20,7 @@ pub struct Metrics {
1920
crater_work_status: IntGaugeVec,
2021
crater_last_crates_update: IntGauge,
2122
pub crater_endpoint_time: HistogramVec,
23+
crater_worker_count: IntGauge,
2224
}
2325

2426
impl Metrics {
@@ -46,16 +48,24 @@ impl Metrics {
4648
&["endpoint"]
4749
)?;
4850

51+
let crater_worker_count = prometheus::opts!(WORKER_COUNT, "number of active workers");
52+
let crater_worker_count = prometheus::register_int_gauge!(crater_worker_count)?;
53+
4954
Ok(Metrics {
5055
crater_completed_jobs_total,
5156
crater_bounced_record_progress,
5257
crater_agent_failure,
5358
crater_work_status,
5459
crater_last_crates_update,
5560
crater_endpoint_time,
61+
crater_worker_count,
5662
})
5763
}
5864

65+
pub fn record_worker_count(&self, count: usize) {
66+
self.crater_worker_count.set(count as i64);
67+
}
68+
5969
pub fn record_error(&self, agent: &str, experiment: &str) {
6070
self.crater_agent_failure
6171
.with_label_values(&[agent, experiment])

src/server/routes/agent.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use crate::agent::Capabilities;
22
use crate::experiments::{Assignee, Experiment};
33
use crate::prelude::*;
44
use crate::results::{DatabaseDB, EncodingType, ProgressData};
5+
use crate::server::agents::WorkerInfo;
56
use crate::server::api_types::{AgentConfig, ApiResponse};
67
use crate::server::auth::{auth_filter, AuthDetails};
78
use crate::server::messages::Message;
@@ -68,6 +69,7 @@ pub fn routes(
6869
let heartbeat = warp::post()
6970
.and(warp::path("heartbeat"))
7071
.and(warp::path::end())
72+
.and(warp::body::json())
7173
.and(data_filter)
7274
.and(auth_filter(data.clone()))
7375
.map(endpoint_heartbeat);
@@ -318,12 +320,19 @@ fn endpoint_record_progress(
318320
ret
319321
}
320322

321-
fn endpoint_heartbeat(data: Arc<Data>, auth: AuthDetails) -> Fallible<Response<Body>> {
323+
fn endpoint_heartbeat(
324+
id: WorkerInfo,
325+
data: Arc<Data>,
326+
auth: AuthDetails,
327+
) -> Fallible<Response<Body>> {
328+
data.agents.add_worker(id);
322329
if let Some(rev) = auth.git_revision {
323330
data.agents.set_git_revision(&auth.name, &rev)?;
324331
}
325332

326333
data.agents.record_heartbeat(&auth.name)?;
334+
data.metrics
335+
.record_worker_count(data.agents.active_worker_count());
327336
Ok(ApiResponse::Success { result: true }.into_response()?)
328337
}
329338

0 commit comments

Comments
 (0)