Skip to content

Commit 75a7d44

Browse files
authored
Fix Back-and-forth RELOAD Bug (#330)
We identified a bug where RELOAD fails to update the pools. To reproduce you need to start at some config state, modify that state a bit, reload, revert the configs back to the original state, and reload. The last reload will fail to update the pool because PgCat "thinks" the pool state didn't change. This is because we use a HashSet to keep track of config hashes but we never remove values from it. Say we start with State A, we modify pool configs to State B and reload. Now the POOL_HASHES struct has State A and State B. Attempting to go back to State A will encounter a hashset hit which is interpreted by PgCat as "Configs are the same, no need to reload pools" We fix this by attaching a config_hash value to ConnectionPool object and we calculate that value when we create the pool. This eliminates the need for a global variable. One shortcoming here is that changing any config under one user in the pool will trigger a reload for the entire pool (which is fine I think)
1 parent 37e1c52 commit 75a7d44

File tree

5 files changed

+75
-17
lines changed

5 files changed

+75
-17
lines changed

src/config.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@ use log::{error, info};
44
use once_cell::sync::Lazy;
55
use regex::Regex;
66
use serde_derive::{Deserialize, Serialize};
7+
use std::collections::hash_map::DefaultHasher;
78
use std::collections::{BTreeMap, HashMap, HashSet};
8-
use std::hash::Hash;
9+
use std::hash::{Hash, Hasher};
910
use std::path::Path;
1011
use std::sync::Arc;
1112
use tokio::fs::File;
@@ -355,6 +356,12 @@ pub struct Pool {
355356
}
356357

357358
impl Pool {
359+
pub fn hash_value(&self) -> u64 {
360+
let mut s = DefaultHasher::new();
361+
self.hash(&mut s);
362+
s.finish()
363+
}
364+
358365
pub fn default_pool_mode() -> PoolMode {
359366
PoolMode::Transaction
360367
}

src/pool.rs

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use parking_lot::{Mutex, RwLock};
99
use rand::seq::SliceRandom;
1010
use rand::thread_rng;
1111
use regex::Regex;
12-
use std::collections::{HashMap, HashSet};
12+
use std::collections::HashMap;
1313
use std::sync::{
1414
atomic::{AtomicBool, Ordering},
1515
Arc,
@@ -37,8 +37,6 @@ pub type PoolMap = HashMap<PoolIdentifier, ConnectionPool>;
3737
/// This is atomic and safe and read-optimized.
3838
/// The pool is recreated dynamically when the config is reloaded.
3939
pub static POOLS: Lazy<ArcSwap<PoolMap>> = Lazy::new(|| ArcSwap::from_pointee(HashMap::default()));
40-
static POOLS_HASH: Lazy<ArcSwap<HashSet<crate::config::Pool>>> =
41-
Lazy::new(|| ArcSwap::from_pointee(HashSet::default()));
4240

4341
/// An identifier for a PgCat pool,
4442
/// a database visible to clients.
@@ -168,6 +166,11 @@ pub struct ConnectionPool {
168166
/// to use it.
169167
validated: Arc<AtomicBool>,
170168

169+
/// Hash value for the pool configs. It is used to compare new configs
170+
/// against current config to decide whether or not we need to recreate
171+
/// the pool after a RELOAD command
172+
pub config_hash: u64,
173+
171174
/// If the pool has been paused or not.
172175
paused: Arc<AtomicBool>,
173176
paused_waiter: Arc<Notify>,
@@ -181,18 +184,18 @@ impl ConnectionPool {
181184
let mut new_pools = HashMap::new();
182185
let mut address_id = 0;
183186

184-
let mut pools_hash = (*(*POOLS_HASH.load())).clone();
185-
186187
for (pool_name, pool_config) in &config.pools {
187-
let changed = pools_hash.insert(pool_config.clone());
188+
let new_pool_hash_value = pool_config.hash_value();
188189

189190
// There is one pool per database/user pair.
190191
for user in pool_config.users.values() {
191-
// If the pool hasn't changed, get existing reference and insert it into the new_pools.
192-
// We replace all pools at the end, but if the reference is kept, the pool won't get re-created (bb8).
193-
if !changed {
194-
match get_pool(pool_name, &user.username) {
195-
Some(pool) => {
192+
let old_pool_ref = get_pool(pool_name, &user.username);
193+
194+
match old_pool_ref {
195+
Some(pool) => {
196+
// If the pool hasn't changed, get existing reference and insert it into the new_pools.
197+
// We replace all pools at the end, but if the reference is kept, the pool won't get re-created (bb8).
198+
if pool.config_hash == new_pool_hash_value {
196199
info!(
197200
"[pool: {}][user: {}] has not changed",
198201
pool_name, user.username
@@ -203,8 +206,8 @@ impl ConnectionPool {
203206
);
204207
continue;
205208
}
206-
None => (),
207209
}
210+
None => (),
208211
}
209212

210213
info!(
@@ -293,6 +296,7 @@ impl ConnectionPool {
293296
addresses,
294297
banlist: Arc::new(RwLock::new(banlist)),
295298
stats: get_reporter(),
299+
config_hash: new_pool_hash_value,
296300
server_info: Arc::new(RwLock::new(BytesMut::new())),
297301
settings: PoolSettings {
298302
pool_mode: pool_config.pool_mode,
@@ -342,8 +346,6 @@ impl ConnectionPool {
342346
}
343347

344348
POOLS.store(Arc::new(new_pools.clone()));
345-
POOLS_HASH.store(Arc::new(pools_hash.clone()));
346-
347349
Ok(())
348350
}
349351

tests/ruby/helpers/pgcat_process.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def current_config
5353

5454
def reload_config
5555
`kill -s HUP #{@pid}`
56-
sleep 0.1
56+
sleep 0.5
5757
end
5858

5959
def start

tests/ruby/load_balancing_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@
150150
end
151151
end
152152

153-
expect(failed_count).to eq(2)
153+
expect(failed_count).to be <= 2
154154
processes.all_databases.each do |instance|
155155
queries_routed = instance.count_select_1_plus_2
156156
if processes.replicas[0..1].include?(instance)

tests/ruby/misc_spec.rb

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,55 @@
88
processes.pgcat.shutdown
99
end
1010

11+
context "when adding then removing instance using RELOAD" do
12+
it "works correctly" do
13+
admin_conn = PG::connect(processes.pgcat.admin_connection_string)
14+
15+
current_configs = processes.pgcat.current_config
16+
correct_count = current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].count
17+
expect(admin_conn.async_exec("SHOW DATABASES").count).to eq(correct_count)
18+
19+
extra_replica = current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].last.clone
20+
extra_replica[0] = "127.0.0.1"
21+
current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"] << extra_replica
22+
23+
processes.pgcat.update_config(current_configs) # with replica added
24+
processes.pgcat.reload_config
25+
correct_count = current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].count
26+
expect(admin_conn.async_exec("SHOW DATABASES").count).to eq(correct_count)
27+
28+
current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].pop
29+
30+
processes.pgcat.update_config(current_configs) # with replica removed again
31+
processes.pgcat.reload_config
32+
correct_count = current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].count
33+
expect(admin_conn.async_exec("SHOW DATABASES").count).to eq(correct_count)
34+
end
35+
end
36+
37+
context "when removing then adding instance back using RELOAD" do
38+
it "works correctly" do
39+
admin_conn = PG::connect(processes.pgcat.admin_connection_string)
40+
41+
current_configs = processes.pgcat.current_config
42+
correct_count = current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].count
43+
expect(admin_conn.async_exec("SHOW DATABASES").count).to eq(correct_count)
44+
45+
removed_replica = current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].pop
46+
processes.pgcat.update_config(current_configs) # with replica removed
47+
processes.pgcat.reload_config
48+
correct_count = current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].count
49+
expect(admin_conn.async_exec("SHOW DATABASES").count).to eq(correct_count)
50+
51+
current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"] << removed_replica
52+
53+
processes.pgcat.update_config(current_configs) # with replica added again
54+
processes.pgcat.reload_config
55+
correct_count = current_configs["pools"]["sharded_db"]["shards"]["0"]["servers"].count
56+
expect(admin_conn.async_exec("SHOW DATABASES").count).to eq(correct_count)
57+
end
58+
end
59+
1160
describe "TCP Keepalives" do
1261
# Ideally, we should block TCP traffic to the database using
1362
# iptables to mimic passive (connection is dropped without a RST packet)

0 commit comments

Comments
 (0)