Skip to content

Commit 38e389f

Browse files
authored
Merge pull request #6008 from fdefelici/feat/signer-mon-conflicts
feat: add monitoring for signer agreement state conflict
2 parents 62202cf + 3020f3a commit 38e389f

File tree

5 files changed

+214
-4
lines changed

5 files changed

+214
-4
lines changed

stacks-signer/src/monitoring/mod.rs

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,21 @@ SignerAgreementStateChangeReason {
3333
InactiveMiner("inactive_miner"),
3434
/// Signer agreement protocol version has been upgraded
3535
ProtocolUpgrade("protocol_upgrade"),
36+
/// An update related to the Miner view
37+
MinerViewUpdate("miner_view_update"),
38+
/// A specific Miner View update related to the parent tenure
39+
MinerParentTenureUpdate("miner_parent_tenure_update"),
40+
});
41+
42+
define_named_enum!(
43+
/// Represent different conflict types on signer agreement protocol
44+
SignerAgreementStateConflict {
45+
/// Waiting for burn block propagation to be aligned with the signer set
46+
BurnBlockDelay("burn_block_delay"),
47+
/// Waiting for stacks block propagation to be aligned with the signer set
48+
StacksBlockDelay("stacks_block_delay"),
49+
/// No agreement on miner view with the signer set
50+
MinerView("miner_view"),
3651
});
3752

3853
/// Actions for updating metrics
@@ -44,7 +59,7 @@ pub mod actions {
4459

4560
use crate::config::GlobalConfig;
4661
use crate::monitoring::prometheus::*;
47-
use crate::monitoring::SignerAgreementStateChangeReason;
62+
use crate::monitoring::{SignerAgreementStateChangeReason, SignerAgreementStateConflict};
4863
use crate::v0::signer_state::LocalStateMachine;
4964

5065
/// Update stacks tip height gauge
@@ -134,6 +149,21 @@ pub mod actions {
134149
.inc();
135150
}
136151

152+
/// Increment signer agreement state conflict counter
153+
pub fn increment_signer_agreement_state_conflict(conflict: SignerAgreementStateConflict) {
154+
let label_value = conflict.get_name();
155+
SIGNER_AGREEMENT_STATE_CONFLICTS
156+
.with_label_values(&[&label_value])
157+
.inc();
158+
}
159+
160+
/// Record the time (seconds) taken for a signer to agree with the signer set
161+
pub fn record_signer_agreement_capitulation_latency(latency_s: u64) {
162+
SIGNER_AGREEMENT_CAPITULATION_LATENCIES_HISTOGRAM
163+
.with_label_values(&[])
164+
.observe(latency_s as f64);
165+
}
166+
137167
/// Start serving monitoring metrics.
138168
/// This will only serve the metrics if the `monitoring_prom` feature is enabled.
139169
pub fn start_serving_monitoring_metrics(config: GlobalConfig) -> Result<(), String> {
@@ -157,7 +187,7 @@ pub mod actions {
157187
use blockstack_lib::chainstate::nakamoto::NakamotoBlock;
158188
use stacks_common::info;
159189

160-
use crate::monitoring::SignerAgreementStateChangeReason;
190+
use crate::monitoring::{SignerAgreementStateChangeReason, SignerAgreementStateConflict};
161191
use crate::v0::signer_state::LocalStateMachine;
162192
use crate::GlobalConfig;
163193

@@ -212,6 +242,12 @@ pub mod actions {
212242
) {
213243
}
214244

245+
/// Increment signer agreement state conflict counter
246+
pub fn increment_signer_agreement_state_conflict(_conflict: SignerAgreementStateConflict) {}
247+
248+
/// Record the time (seconds) taken for a signer to agree with the signer set
249+
pub fn record_signer_agreement_capitulation_latency(_latency_s: u64) {}
250+
215251
/// Start serving monitoring metrics.
216252
/// This will only serve the metrics if the `monitoring_prom` feature is enabled.
217253
pub fn start_serving_monitoring_metrics(config: GlobalConfig) -> Result<(), String> {

stacks-signer/src/monitoring/prometheus.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,22 @@ lazy_static! {
8181

8282
pub static ref SIGNER_AGREEMENT_STATE_CHANGE_REASONS: IntCounterVec = register_int_counter_vec!(
8383
"stacks_signer_agreement_state_change_reasons",
84-
"The number of state machine changes in signer agreement protocol. `reason` can be one of: 'burn_block_arrival', 'stacks_block_arrival', 'inactive_miner', 'protocol_upgrade'",
84+
"The number of state machine changes in signer agreement protocol. `reason` can be one of: 'burn_block_arrival', 'stacks_block_arrival', 'inactive_miner', 'protocol_upgrade', 'miner_view_update', 'miner_parent_tenure_update'",
8585
&["reason"]
8686
).unwrap();
8787

88+
pub static ref SIGNER_AGREEMENT_STATE_CONFLICTS: IntCounterVec = register_int_counter_vec!(
89+
"stacks_signer_agreement_state_conflicts",
90+
"The number of state machine conflicts in signer agreement protocol. `conflict` can be one of: 'burn_block_delay', 'stacks_block_delay', 'miner_view'",
91+
&["conflict"]
92+
).unwrap();
93+
94+
pub static ref SIGNER_AGREEMENT_CAPITULATION_LATENCIES_HISTOGRAM: HistogramVec = register_histogram_vec!(histogram_opts!(
95+
"stacks_signer_agreement_capitulation_latencies_histogram",
96+
"Measuring the time (in seconds) for the signer to agree (capitulate) with the signer set",
97+
vec![0.0, 1.0, 3.0, 5.0, 10.0, 20.0, 30.0, 60.0, 120.0]
98+
), &[]).unwrap();
99+
88100
pub static ref SIGNER_LOCAL_STATE_MACHINE: Mutex<Option<LocalStateMachine>> = Mutex::new(None);
89101
}
90102

stacks-signer/src/signerdb.rs

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,7 @@ CREATE TABLE IF NOT EXISTS signer_state_machine_updates (
528528
signer_addr TEXT NOT NULL,
529529
reward_cycle INTEGER NOT NULL,
530530
state_update TEXT NOT NULL,
531-
received_time TEXT NOT NULL,
531+
received_time INTEGER NOT NULL,
532532
PRIMARY KEY (signer_addr, reward_cycle)
533533
) STRICT;"#;
534534

@@ -1412,6 +1412,22 @@ impl SignerDb {
14121412
}
14131413
Ok(result)
14141414
}
1415+
1416+
/// Retrieve the elapsed time (in seconds) between
1417+
/// the oldest and the newest state machine update messages
1418+
/// produced by the signer set
1419+
pub fn get_signer_state_machine_updates_latency(
1420+
&self,
1421+
reward_cycle: u64,
1422+
) -> Result<u64, DBError> {
1423+
let query = "SELECT COALESCE( (MAX(received_time) - MIN(received_time)), 0 ) AS elapsed_time FROM signer_state_machine_updates WHERE reward_cycle = ?1";
1424+
let args = params![u64_to_sql(reward_cycle)?];
1425+
let elapsed_time_opt: Option<u64> = query_row(&self.db, query, args)?;
1426+
match elapsed_time_opt {
1427+
Some(seconds) => Ok(seconds),
1428+
None => Ok(0),
1429+
}
1430+
}
14151431
}
14161432

14171433
fn try_deserialize<T>(s: Option<String>) -> Result<Option<T>, DBError>
@@ -2515,4 +2531,92 @@ pub mod tests {
25152531
assert_eq!(updates.get(&address_2), None);
25162532
assert_eq!(updates.get(&address_3), Some(&update_3));
25172533
}
2534+
2535+
#[test]
2536+
fn retrieve_latency_for_signer_state_machine_updates() {
2537+
let db_path = tmp_db_path();
2538+
let mut db = SignerDb::new(db_path).expect("Failed to create signer db");
2539+
let reward_cycle_1 = 1;
2540+
let address_1 = StacksAddress::p2pkh(false, &StacksPublicKey::new());
2541+
let update_1 = StateMachineUpdate::new(
2542+
0,
2543+
3,
2544+
StateMachineUpdateContent::V0 {
2545+
burn_block: ConsensusHash([0x55; 20]),
2546+
burn_block_height: 100,
2547+
current_miner: StateMachineUpdateMinerState::ActiveMiner {
2548+
current_miner_pkh: Hash160([0xab; 20]),
2549+
tenure_id: ConsensusHash([0x44; 20]),
2550+
parent_tenure_id: ConsensusHash([0x22; 20]),
2551+
parent_tenure_last_block: StacksBlockId([0x33; 32]),
2552+
parent_tenure_last_block_height: 1,
2553+
},
2554+
},
2555+
)
2556+
.unwrap();
2557+
let time_1 = SystemTime::UNIX_EPOCH;
2558+
2559+
let address_2 = StacksAddress::p2pkh(false, &StacksPublicKey::new());
2560+
let update_2 = StateMachineUpdate::new(
2561+
0,
2562+
4,
2563+
StateMachineUpdateContent::V0 {
2564+
burn_block: ConsensusHash([0x55; 20]),
2565+
burn_block_height: 100,
2566+
current_miner: StateMachineUpdateMinerState::NoValidMiner,
2567+
},
2568+
)
2569+
.unwrap();
2570+
let time_2 = SystemTime::UNIX_EPOCH + Duration::from_secs(1);
2571+
2572+
let address_3 = StacksAddress::p2pkh(false, &StacksPublicKey::new());
2573+
let update_3 = StateMachineUpdate::new(
2574+
0,
2575+
2,
2576+
StateMachineUpdateContent::V0 {
2577+
burn_block: ConsensusHash([0x66; 20]),
2578+
burn_block_height: 101,
2579+
current_miner: StateMachineUpdateMinerState::NoValidMiner,
2580+
},
2581+
)
2582+
.unwrap();
2583+
let time_3 = SystemTime::UNIX_EPOCH + Duration::from_secs(10);
2584+
2585+
assert_eq!(
2586+
0,
2587+
db.get_signer_state_machine_updates_latency(reward_cycle_1)
2588+
.unwrap(),
2589+
"latency on empty database should be 0 seconds for reward_cycle {reward_cycle_1}"
2590+
);
2591+
2592+
db.insert_state_machine_update(reward_cycle_1, &address_1, &update_1, &time_1)
2593+
.expect("Unable to insert block into db");
2594+
2595+
assert_eq!(
2596+
0,
2597+
db.get_signer_state_machine_updates_latency(reward_cycle_1)
2598+
.unwrap(),
2599+
"latency between same update should be 0 seconds"
2600+
);
2601+
2602+
db.insert_state_machine_update(reward_cycle_1, &address_2, &update_2, &time_2)
2603+
.expect("Unable to insert block into db");
2604+
2605+
assert_eq!(
2606+
1,
2607+
db.get_signer_state_machine_updates_latency(reward_cycle_1)
2608+
.unwrap(),
2609+
"latency between updates should be 1 second"
2610+
);
2611+
2612+
db.insert_state_machine_update(reward_cycle_1, &address_3, &update_3, &time_3)
2613+
.expect("Unable to insert block into db");
2614+
2615+
assert_eq!(
2616+
10,
2617+
db.get_signer_state_machine_updates_latency(reward_cycle_1)
2618+
.unwrap(),
2619+
"latency between updates should be 10 second"
2620+
);
2621+
}
25182622
}

stacks-signer/src/v0/signer.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,7 @@ impl Signer {
734734
&mut self.global_state_evaluator,
735735
self.stacks_address,
736736
version,
737+
self.reward_cycle,
737738
);
738739
}
739740

stacks-signer/src/v0/signer_state.rs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,9 @@ impl GlobalStateEvaluator {
200200
};
201201
let (global_burn_view, _) = self.determine_global_burn_view(local_address, local_update)?;
202202
if current_burn_block != global_burn_view {
203+
crate::monitoring::actions::increment_signer_agreement_state_conflict(
204+
crate::monitoring::SignerAgreementStateConflict::BurnBlockDelay,
205+
);
203206
return None;
204207
}
205208
let mut current_miners = HashMap::new();
@@ -240,6 +243,9 @@ impl GlobalStateEvaluator {
240243
}
241244
}
242245
}
246+
crate::monitoring::actions::increment_signer_agreement_state_conflict(
247+
crate::monitoring::SignerAgreementStateConflict::MinerView,
248+
);
243249
None
244250
}
245251

@@ -822,6 +828,7 @@ impl LocalStateMachine {
822828
eval: &mut GlobalStateEvaluator,
823829
local_address: StacksAddress,
824830
local_supported_signer_protocol_version: u64,
831+
reward_cycle: u64,
825832
) {
826833
// Before we ever access eval...we should make sure to include our own local state machine update message in the evaluation
827834
let Ok(mut local_update) =
@@ -859,6 +866,9 @@ impl LocalStateMachine {
859866

860867
if active_signer_protocol_version != old_protocol_version {
861868
info!("Updating active signer protocol version from {old_protocol_version} to {active_signer_protocol_version}");
869+
crate::monitoring::actions::increment_signer_agreement_state_change_reason(
870+
crate::monitoring::SignerAgreementStateChangeReason::ProtocolUpgrade,
871+
);
862872
*self = Self::Initialized(SignerStateMachine {
863873
burn_block: *burn_block,
864874
burn_block_height: *burn_block_height,
@@ -907,6 +917,12 @@ impl LocalStateMachine {
907917
"current_miner" => ?current_miner,
908918
"new_miner" => ?new_miner,
909919
);
920+
crate::monitoring::actions::increment_signer_agreement_state_change_reason(
921+
crate::monitoring::SignerAgreementStateChangeReason::MinerViewUpdate,
922+
);
923+
Self::monitor_miner_parent_tenure_update(&current_miner, &new_miner);
924+
Self::monitor_capitulation_latency(signerdb, reward_cycle);
925+
910926
*self = Self::Initialized(SignerStateMachine {
911927
burn_block,
912928
burn_block_height,
@@ -917,6 +933,47 @@ impl LocalStateMachine {
917933
}
918934
}
919935

936+
#[allow(unused_variables)]
937+
fn monitor_miner_parent_tenure_update(
938+
current_miner: &StateMachineUpdateMinerState,
939+
new_miner: &StateMachineUpdateMinerState,
940+
) {
941+
#[cfg(feature = "monitoring_prom")]
942+
if let (
943+
StateMachineUpdateMinerState::ActiveMiner {
944+
parent_tenure_id: current_parent_tenure,
945+
..
946+
},
947+
StateMachineUpdateMinerState::ActiveMiner {
948+
parent_tenure_id: new_parent_tenure,
949+
..
950+
},
951+
) = (&current_miner, &new_miner)
952+
{
953+
if current_parent_tenure != new_parent_tenure {
954+
crate::monitoring::actions::increment_signer_agreement_state_change_reason(
955+
crate::monitoring::SignerAgreementStateChangeReason::MinerParentTenureUpdate,
956+
);
957+
}
958+
}
959+
}
960+
961+
#[allow(unused_variables)]
962+
fn monitor_capitulation_latency(signer_db: &SignerDb, reward_cycle: u64) {
963+
#[cfg(feature = "monitoring_prom")]
964+
{
965+
let latency_result = signer_db.get_signer_state_machine_updates_latency(reward_cycle);
966+
match latency_result {
967+
Ok(seconds) => {
968+
crate::monitoring::actions::record_signer_agreement_capitulation_latency(
969+
seconds,
970+
)
971+
}
972+
Err(e) => warn!("Failed to retrieve state updates latency in signerdb: {e}"),
973+
}
974+
}
975+
}
976+
920977
/// Extract out the tx replay set if it exists
921978
pub fn get_tx_replay_set(&self) -> Option<Vec<StacksTransaction>> {
922979
let Self::Initialized(state) = self else {

0 commit comments

Comments
 (0)