|
| 1 | +use crate::stats::ClientStats; |
| 2 | +use chrono::naive::NaiveDateTime; |
| 3 | +use log::{debug, error, warn}; |
| 4 | +use parking_lot::RwLock; |
| 5 | +use std::sync::Arc; |
| 6 | + |
| 7 | +use std::collections::HashMap; |
| 8 | + |
| 9 | +use crate::config::{Address, Role}; |
/// Shared, lock-protected ban table.
///
/// The outer `Vec` is indexed by an address's `shard`; each entry maps a banned
/// [`Address`] to the reason it was banned and the timestamp at which the ban
/// was recorded.
pub type BanList = Arc<RwLock<Vec<HashMap<Address, (BanReason, NaiveDateTime)>>>>;
| 11 | +#[derive(Debug, Clone, Default)] |
| 12 | +pub struct BanService { |
| 13 | + /// List of banned addresses (see above) |
| 14 | + /// that should not be queried. |
| 15 | + banlist: BanList, |
| 16 | + |
| 17 | + /// Whether or not we should use primary when replicas are unavailable |
| 18 | + pub replica_to_primary_failover_enabled: bool, |
| 19 | + |
| 20 | + /// Ban time (in seconds) |
| 21 | + pub ban_time: i64, |
| 22 | +} |
| 23 | + |
/// Why a server was banned.
#[derive(Clone, Debug, PartialEq)]
pub enum BanReason {
    FailedHealthCheck,
    MessageSendFailed,
    MessageReceiveFailed,
    FailedCheckout,
    StatementTimeout,
    /// Ban with an explicit duration; the payload is compared against the
    /// elapsed seconds since the ban when deciding whether it has expired.
    AdminBan(i64),
}
| 34 | + |
/// Why a replica should be unbanned, as reported by `BanService::should_unban`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnbanReason {
    /// The number of banned addresses in the shard equals the number of
    /// replicas in the shard.
    AllReplicasBanned,
    /// More time has passed since the ban than the ban is allowed to last.
    BanTimeExceeded,
    /// The address is a primary; this should never happen.
    PrimaryBanned,
    /// The address has no entry in the ban list.
    NotBanned,
}
| 41 | + |
| 42 | +impl BanService { |
| 43 | + pub fn new(replica_to_primary_failover_enabled: bool, ban_time: i64) -> Self { |
| 44 | + BanService { |
| 45 | + banlist: Arc::new(RwLock::new(vec![HashMap::new()])), |
| 46 | + replica_to_primary_failover_enabled, |
| 47 | + ban_time, |
| 48 | + } |
| 49 | + } |
| 50 | + |
| 51 | + /// Ban an address (i.e. replica). It no longer will serve |
| 52 | + /// traffic for any new transactions. Existing transactions on that replica |
| 53 | + /// will finish successfully or error out to the clients. |
| 54 | + pub fn ban(&self, address: &Address, reason: BanReason, client_info: Option<&ClientStats>) { |
| 55 | + // Count the number of errors since the last successful checkout |
| 56 | + // This is used to determine if the shard is down |
| 57 | + match reason { |
| 58 | + BanReason::FailedHealthCheck |
| 59 | + | BanReason::FailedCheckout |
| 60 | + | BanReason::MessageSendFailed |
| 61 | + | BanReason::MessageReceiveFailed => { |
| 62 | + address.increment_error_count(); |
| 63 | + } |
| 64 | + _ => (), |
| 65 | + }; |
| 66 | + |
| 67 | + // Primary can never be banned |
| 68 | + if address.role == Role::Primary { |
| 69 | + return; |
| 70 | + } |
| 71 | + |
| 72 | + let now = chrono::offset::Utc::now().naive_utc(); |
| 73 | + error!("Banning instance {:?}, reason: {:?}", address, reason); |
| 74 | + let mut guard = self.banlist.write(); |
| 75 | + |
| 76 | + if let Some(client_info) = client_info { |
| 77 | + client_info.ban_error(); |
| 78 | + address.stats.error(); |
| 79 | + } |
| 80 | + |
| 81 | + guard[address.shard].insert(address.clone(), (reason, now)); |
| 82 | + } |
| 83 | + |
| 84 | + /// Clear the replica to receive traffic again. Takes effect immediately |
| 85 | + /// for all new transactions. |
| 86 | + pub fn unban(&self, address: &Address) { |
| 87 | + warn!("Unbanning {:?}", address); |
| 88 | + let mut guard = self.banlist.write(); |
| 89 | + guard[address.shard].remove(address); |
| 90 | + } |
| 91 | + |
| 92 | + /// Check if address is banned |
| 93 | + /// true if banned, false otherwise |
| 94 | + pub fn is_banned(&self, address: &Address) -> bool { |
| 95 | + let guard = self.banlist.read(); |
| 96 | + |
| 97 | + match guard[address.shard].get(address) { |
| 98 | + Some(_) => true, |
| 99 | + None => { |
| 100 | + debug!("{:?} is ok", address); |
| 101 | + false |
| 102 | + } |
| 103 | + } |
| 104 | + } |
| 105 | + |
| 106 | + /// Returns a list of banned replicas |
| 107 | + pub fn get_bans(&self) -> Vec<(Address, (BanReason, NaiveDateTime))> { |
| 108 | + let mut bans: Vec<(Address, (BanReason, NaiveDateTime))> = Vec::new(); |
| 109 | + let guard = self.banlist.read(); |
| 110 | + for banlist in guard.iter() { |
| 111 | + for (address, (reason, timestamp)) in banlist.iter() { |
| 112 | + bans.push((address.clone(), (reason.clone(), *timestamp))); |
| 113 | + } |
| 114 | + } |
| 115 | + bans |
| 116 | + } |
| 117 | + |
| 118 | + /// Unban all replicas in the shard |
| 119 | + /// This is typically used when all replicas are banned and |
| 120 | + /// we don't allow sending traffic to primary. |
| 121 | + pub fn unban_all_replicas(&self, address: &Address) { |
| 122 | + let mut write_guard = self.banlist.write(); |
| 123 | + warn!("Unbanning all replicas."); |
| 124 | + write_guard[address.shard].clear(); |
| 125 | + } |
| 126 | + |
| 127 | + /// Determines whether a replica should be unban and returns the reason |
| 128 | + /// why it should be unbanned. |
| 129 | + /// |
| 130 | + /// UnbanReason: |
| 131 | + /// - All replicas are banned (AllReplicasBanned) |
| 132 | + /// - Ban time is exceeded (BanTimeExceeded) |
| 133 | + /// - Primary is banned (PrimaryBanned, this should never happen) |
| 134 | + /// - Not banned (NotBanned, the replica was unbanned while checking the conditions) |
| 135 | + /// |
| 136 | + /// Returns: |
| 137 | + /// - Some(UnbanReason), if the replica should be unbanned |
| 138 | + /// - None, if the replica should not be unbanned |
| 139 | + pub fn should_unban( |
| 140 | + &self, |
| 141 | + pool_addresses: &[Vec<Address>], |
| 142 | + address: &Address, |
| 143 | + ) -> Option<UnbanReason> { |
| 144 | + // If somehow primary ends up being banned we should return true here |
| 145 | + if address.role == Role::Primary { |
| 146 | + return Some(UnbanReason::PrimaryBanned); |
| 147 | + } |
| 148 | + |
| 149 | + // If we have replica to primary failover we should not unban replicas |
| 150 | + // as we still have the primary to server traffic. |
| 151 | + if !self.replica_to_primary_failover_enabled { |
| 152 | + // Check if all replicas are banned, in that case unban all of them |
| 153 | + let replicas_available = pool_addresses[address.shard] |
| 154 | + .iter() |
| 155 | + .filter(|addr| addr.role == Role::Replica) |
| 156 | + .count(); |
| 157 | + |
| 158 | + debug!("Available targets: {}", replicas_available); |
| 159 | + |
| 160 | + let read_guard = self.banlist.read(); |
| 161 | + let all_replicas_banned = read_guard[address.shard].len() == replicas_available; |
| 162 | + drop(read_guard); |
| 163 | + |
| 164 | + if all_replicas_banned { |
| 165 | + return Some(UnbanReason::AllReplicasBanned); |
| 166 | + } |
| 167 | + } |
| 168 | + |
| 169 | + // Check if ban time is expired |
| 170 | + let read_guard = self.banlist.read(); |
| 171 | + let exceeded_ban_time = match read_guard[address.shard].get(address) { |
| 172 | + Some((ban_reason, timestamp)) => { |
| 173 | + let now = chrono::offset::Utc::now().naive_utc(); |
| 174 | + match ban_reason { |
| 175 | + BanReason::AdminBan(duration) => { |
| 176 | + now.timestamp() - timestamp.timestamp() > *duration |
| 177 | + } |
| 178 | + _ => now.timestamp() - timestamp.timestamp() > self.ban_time, |
| 179 | + } |
| 180 | + } |
| 181 | + None => return Some(UnbanReason::NotBanned), |
| 182 | + }; |
| 183 | + drop(read_guard); |
| 184 | + |
| 185 | + if exceeded_ban_time { |
| 186 | + Some(UnbanReason::BanTimeExceeded) |
| 187 | + } else { |
| 188 | + debug!("{:?} is banned", address); |
| 189 | + None |
| 190 | + } |
| 191 | + } |
| 192 | +} |
0 commit comments