Skip to content

Commit 29afb2a

Browse files
authored
fix(hermes): tune readiness probe (#1753)
* fix(hermes): tune readiness probe This change modifies the not_behind readiness indicator to use the most recent slot from Pythnet accumulator instead of taking the max because we think that during forks validators might return an absurdly large slot. It also makes it more verbose by returning the indicators and the initial values for them so we can investigate better if the issue persists. * fix: use saturating sub to avoid overflow issue
1 parent 7ef9aa2 commit 29afb2a

File tree

4 files changed

+56
-23
lines changed

4 files changed

+56
-23
lines changed

apps/hermes/server/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apps/hermes/server/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "hermes"
3-
version = "0.5.14"
3+
version = "0.5.15"
44
description = "Hermes is an agent that provides Verified Prices from the Pythnet Pyth Oracle."
55
edition = "2021"
66

apps/hermes/server/src/api/rest/ready.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use {
1010
IntoResponse,
1111
Response,
1212
},
13+
Json,
1314
},
1415
};
1516

@@ -19,7 +20,7 @@ where
1920
{
2021
let state = &*state.state;
2122
match Aggregates::is_ready(state).await {
22-
true => (StatusCode::OK, "OK").into_response(),
23-
false => (StatusCode::SERVICE_UNAVAILABLE, "Service Unavailable").into_response(),
23+
(true, _) => (StatusCode::OK, "OK").into_response(),
24+
(false, metadata) => (StatusCode::SERVICE_UNAVAILABLE, Json(metadata)).into_response(),
2425
}
2526
}

apps/hermes/server/src/state/aggregate.rs

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
#[cfg(test)]
22
use mock_instant::{
3-
Instant,
43
SystemTime,
54
UNIX_EPOCH,
65
};
76
#[cfg(not(test))]
87
use std::time::{
9-
Instant,
108
SystemTime,
119
UNIX_EPOCH,
1210
};
@@ -56,6 +54,7 @@ use {
5654
},
5755
},
5856
},
57+
serde::Serialize,
5958
std::{
6059
collections::HashSet,
6160
time::Duration,
@@ -116,10 +115,13 @@ pub struct AggregateStateData {
116115
pub latest_completed_slot: Option<Slot>,
117116

118117
/// Time of the latest completed update. This is used for the health probes.
119-
pub latest_completed_update_at: Option<Instant>,
118+
pub latest_completed_update_time: Option<SystemTime>,
120119

121120
/// The latest observed slot among different Aggregate updates. This is used for the health
122-
/// probes.
121+
/// probes. The slot is not necessarily the maximum observed slot but it should be close
122+
/// to the maximum. The maximum observed slot is not used because sometimes due to some
123+
/// network issues we might receive an update with a much higher slot specially during
124+
/// the forks.
123125
pub latest_observed_slot: Option<Slot>,
124126

125127
/// The duration of no aggregation after which the readiness of the state is considered stale.
@@ -140,7 +142,7 @@ impl AggregateStateData {
140142
) -> Self {
141143
Self {
142144
latest_completed_slot: None,
143-
latest_completed_update_at: None,
145+
latest_completed_update_time: None,
144146
latest_observed_slot: None,
145147
metrics: metrics::Metrics::new(metrics_registry),
146148
readiness_staleness_threshold,
@@ -213,6 +215,17 @@ pub struct PriceFeedsWithUpdateData {
213215
pub update_data: Vec<Vec<u8>>,
214216
}
215217

218+
#[derive(Debug, Serialize)]
219+
pub struct ReadinessMetadata {
220+
pub has_completed_recently: bool,
221+
pub is_not_behind: bool,
222+
pub is_metadata_loaded: bool,
223+
pub latest_completed_slot: Option<Slot>,
224+
pub latest_observed_slot: Option<Slot>,
225+
pub latest_completed_unix_timestamp: Option<UnixTimestamp>,
226+
pub price_feeds_metadata_len: usize,
227+
}
228+
216229
#[async_trait::async_trait]
217230
pub trait Aggregates
218231
where
@@ -221,7 +234,7 @@ where
221234
Self: PriceFeedMeta,
222235
{
223236
fn subscribe(&self) -> Receiver<AggregationEvent>;
224-
async fn is_ready(&self) -> bool;
237+
async fn is_ready(&self) -> (bool, ReadinessMetadata);
225238
async fn store_update(&self, update: Update) -> Result<()>;
226239
async fn get_price_feed_ids(&self) -> HashSet<PriceIdentifier>;
227240
async fn get_price_feeds_with_update_data(
@@ -304,10 +317,7 @@ where
304317
// Update the aggregate state with the latest observed slot
305318
{
306319
let mut aggregate_state = self.into().data.write().await;
307-
aggregate_state.latest_observed_slot = aggregate_state
308-
.latest_observed_slot
309-
.map(|latest| latest.max(slot))
310-
.or(Some(slot));
320+
aggregate_state.latest_observed_slot = Some(slot);
311321
}
312322

313323
let accumulator_messages = self.fetch_accumulator_messages(slot).await?;
@@ -366,8 +376,8 @@ where
366376
.or(Some(slot));
367377

368378
aggregate_state
369-
.latest_completed_update_at
370-
.replace(Instant::now());
379+
.latest_completed_update_time
380+
.replace(SystemTime::now());
371381

372382
aggregate_state
373383
.metrics
@@ -401,15 +411,20 @@ where
401411
.collect()
402412
}
403413

404-
async fn is_ready(&self) -> bool {
414+
async fn is_ready(&self) -> (bool, ReadinessMetadata) {
405415
let state_data = self.into().data.read().await;
406416
let price_feeds_metadata = PriceFeedMeta::retrieve_price_feeds_metadata(self)
407417
.await
408418
.unwrap();
409419

410-
let has_completed_recently = match state_data.latest_completed_update_at.as_ref() {
420+
let current_time = SystemTime::now();
421+
422+
let has_completed_recently = match state_data.latest_completed_update_time {
411423
Some(latest_completed_update_time) => {
412-
latest_completed_update_time.elapsed() < state_data.readiness_staleness_threshold
424+
current_time
425+
.duration_since(latest_completed_update_time)
426+
.unwrap_or(Duration::from_secs(0))
427+
< state_data.readiness_staleness_threshold
413428
}
414429
None => false,
415430
};
@@ -419,14 +434,31 @@ where
419434
state_data.latest_observed_slot,
420435
) {
421436
(Some(latest_completed_slot), Some(latest_observed_slot)) => {
422-
latest_observed_slot - latest_completed_slot
437+
latest_observed_slot.saturating_sub(latest_completed_slot)
423438
<= state_data.readiness_max_allowed_slot_lag
424439
}
425440
_ => false,
426441
};
427442

428443
let is_metadata_loaded = !price_feeds_metadata.is_empty();
429-
has_completed_recently && is_not_behind && is_metadata_loaded
444+
(
445+
has_completed_recently && is_not_behind && is_metadata_loaded,
446+
ReadinessMetadata {
447+
has_completed_recently,
448+
is_not_behind,
449+
is_metadata_loaded,
450+
latest_completed_slot: state_data.latest_completed_slot,
451+
latest_observed_slot: state_data.latest_observed_slot,
452+
latest_completed_unix_timestamp: state_data.latest_completed_update_time.and_then(
453+
|t| {
454+
t.duration_since(UNIX_EPOCH)
455+
.map(|d| d.as_secs() as i64)
456+
.ok()
457+
},
458+
),
459+
price_feeds_metadata_len: price_feeds_metadata.len(),
460+
},
461+
)
430462
}
431463
}
432464

@@ -896,14 +928,14 @@ mod test {
896928
.unwrap();
897929

898930
// Check the state is ready
899-
assert!(state.is_ready().await);
931+
assert!(state.is_ready().await.0);
900932

901933
// Advance the clock to make the prices stale
902934
let staleness_threshold = Duration::from_secs(30);
903935
MockClock::advance_system_time(staleness_threshold);
904936
MockClock::advance(staleness_threshold);
905937
// Check the state is not ready
906-
assert!(!state.is_ready().await);
938+
assert!(!state.is_ready().await.0);
907939
}
908940

909941
/// Test that the state retains the latest slots upon cache eviction.

0 commit comments

Comments
 (0)