Skip to content

Commit 385ff29

Browse files
authored
Fix for stranded connection initiators (#3771)
There was a bug where the peer checked to see if it was an initiator via an invalid check where it compared sorted `Baseboard`s, rather than comparing addresses. This resulted in all the `StaleXXX messages` and made it so that the peer fsm would not actually discover that it was connected on the initiating side because of an early return. This would result in retry messages not getting sent from the FSM. This is the most likely cause of the RSS failure on dogfood as error messages occur around rack init time. While looking at the code I also noticed two other bugs, that I don't believe we've necessarily hit. These are loss of tracking connections when an *actual* stale message is received.
1 parent 411c1e7 commit 385ff29

File tree

1 file changed

+25
-14
lines changed

1 file changed

+25
-14
lines changed

bootstore/src/schemes/v0/peer.rs

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,19 @@ impl Node {
392392
return;
393393
};
394394
// Remove any existing connection
395-
self.remove_accepted_connection(&addr).await;
395+
if let Some(handle) = self.accepted_connections.remove(&addr) {
396+
info!(
397+
self.log,
398+
concat!(
399+
"Removing acccepted connection from {}: ",
400+
"new connection accepted from same address"
401+
),
402+
addr
403+
);
404+
405+
// The connection has not yet completed its handshake
406+
let _ = handle.tx.send(MainToConnMsg::Close).await;
407+
}
396408
info!(self.log, "Accepted connection from {addr}");
397409
self.handle_unique_id_counter += 1;
398410
let handle = spawn_accepted_connection_management_task(
@@ -765,9 +777,12 @@ impl Node {
765777
return;
766778
};
767779

768-
// Ignore the stale message if the unique_id doesn't match what
769-
// we have stored.
780+
// Put back the non-matching connection we removed
781+
// The received message is stale, so we return.
770782
if unique_id != accepted_handle.unique_id {
783+
self.accepted_connections
784+
.insert(accepted_addr, accepted_handle);
785+
771786
return;
772787
}
773788

@@ -803,9 +818,10 @@ impl Node {
803818
ConnToMainMsgInner::ConnectedInitiator { addr, peer_id } => {
804819
if let Some(handle) = self.initiating_connections.remove(&addr)
805820
{
806-
// Ignore the stale message if the unique_id doesn't match what
807-
// we have stored.
821+
// Put back the non-matching connection we removed
822+
// The received message is stale, so we return.
808823
if unique_id != handle.unique_id {
824+
self.initiating_connections.insert(addr, handle);
809825
return;
810826
}
811827

@@ -859,8 +875,10 @@ impl Node {
859875
warn!(self.log, "peer disconnected {peer_id}");
860876
let handle =
861877
self.established_connections.remove(&peer_id).unwrap();
862-
if peer_id < self.config.id {
863-
// Put the connection handle back in initiating state
878+
// We always connect to peers lower than ourselves, and the
879+
// connecting task never exits, it just loops. Therefore we know
880+
// that we are initiating this connection again.
881+
if handle.addr < self.config.addr {
864882
self.initiating_connections.insert(handle.addr, handle);
865883
}
866884
self.fsm.on_disconnected(&peer_id);
@@ -946,13 +964,6 @@ impl Node {
946964
}
947965
}
948966

949-
async fn remove_accepted_connection(&mut self, addr: &SocketAddrV6) {
950-
if let Some(handle) = self.accepted_connections.remove(&addr) {
951-
// The connection has not yet completed its handshake
952-
let _ = handle.tx.send(MainToConnMsg::Close).await;
953-
}
954-
}
955-
956967
async fn remove_peer(&mut self, addr: SocketAddrV6) {
957968
if let Some(handle) = self.initiating_connections.remove(&addr) {
958969
// The connection has not yet completed its handshake

0 commit comments

Comments
 (0)