Skip to content

Commit 3329308

Browse files
committed
multi: fix deadlock in p2p race condition
In case we cannot guarantee that the peers has started up completely we launch the disconnect method asynchronously.
1 parent 6b80796 commit 3329308

File tree

2 files changed

+52
-26
lines changed

2 files changed

+52
-26
lines changed

peer/brontide.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1591,6 +1591,10 @@ func (p *Brontide) WaitForDisconnect(ready chan struct{}) {
15911591
// Disconnect terminates the connection with the remote peer. Additionally, a
15921592
// signal is sent to the server and htlcSwitch indicating the resources
15931593
// allocated to the peer can now be cleaned up.
1594+
//
1595+
// NOTE: Be aware that this method will block if the peer is still starting up.
1596+
// Therefore consider starting it in a goroutine if you cannot guarantee that
1597+
// the peer has finished starting up before calling this method.
15941598
func (p *Brontide) Disconnect(reason error) {
15951599
if !atomic.CompareAndSwapInt32(&p.disconnect, 0, 1) {
15961600
return
@@ -1603,7 +1607,8 @@ func (p *Brontide) Disconnect(reason error) {
16031607
// started, otherwise we will skip reading it as this chan won't be
16041608
// closed, hence blocks forever.
16051609
if atomic.LoadInt32(&p.started) == 1 {
1606-
p.log.Debugf("Started, waiting on startReady signal")
1610+
p.log.Debugf("Peer hasn't finished starting up yet, waiting " +
1611+
"on startReady signal before closing connection")
16071612

16081613
select {
16091614
case <-p.startReady:

server.go

Lines changed: 46 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4941,27 +4941,22 @@ func (s *server) connectToPersistentPeer(pubKeyStr string) {
49414941

49424942
// removePeer removes the passed peer from the server's state of all active
49434943
// peers.
4944+
//
4945+
// NOTE: Server mutex must be held when calling this function.
49444946
func (s *server) removePeer(p *peer.Brontide) {
49454947
if p == nil {
49464948
return
49474949
}
49484950

4949-
srvrLog.Debugf("removing peer %v", p)
4950-
4951-
// As the peer is now finished, ensure that the TCP connection is
4952-
// closed and all of its related goroutines have exited.
4953-
p.Disconnect(fmt.Errorf("server: disconnecting peer %v", p))
4954-
4955-
// If this peer had an active persistent connection request, remove it.
4956-
if p.ConnReq() != nil {
4957-
s.connMgr.Remove(p.ConnReq().ID())
4958-
}
4951+
srvrLog.Debugf("Removing peer %v", p)
49594952

4960-
// Ignore deleting peers if we're shutting down.
4953+
// Exit early if we have already been instructed to shutdown, the peers
4954+
// will be disconnected in the server shutdown process.
49614955
if s.Stopped() {
49624956
return
49634957
}
49644958

4959+
// Capture the peer's public key and string representation.
49654960
pKey := p.PubKey()
49664961
pubSer := pKey[:]
49674962
pubStr := string(pubSer)
@@ -4974,22 +4969,45 @@ func (s *server) removePeer(p *peer.Brontide) {
49744969
delete(s.outboundPeers, pubStr)
49754970
}
49764971

4977-
// Remove the peer's access permission from the access manager.
4978-
peerPubStr := string(p.IdentityKey().SerializeCompressed())
4979-
s.peerAccessMan.removePeerAccess(peerPubStr)
4972+
// When removing the peer we make sure to disconnect it asynchronously
4973+
// to avoid blocking the main server goroutine because it is holding the
4974+
// server's mutex. Disconnecting the peer might block and wait until the
4975+
// peer has fully started up. This can happen if an inbound and outbound
4976+
// race condition occurs.
4977+
s.wg.Add(1)
4978+
go func() {
4979+
defer s.wg.Done()
49804980

4981-
// Copy the peer's error buffer across to the server if it has any items
4982-
// in it so that we can restore peer errors across connections.
4983-
if p.ErrorBuffer().Total() > 0 {
4984-
s.peerErrors[pubStr] = p.ErrorBuffer()
4985-
}
4981+
p.Disconnect(fmt.Errorf("server: disconnecting peer %v", p))
49864982

4987-
// Inform the peer notifier of a peer offline event so that it can be
4988-
// reported to clients listening for peer events.
4989-
var pubKey [33]byte
4990-
copy(pubKey[:], pubSer)
4983+
// If this peer had an active persistent connection request,
4984+
// remove it.
4985+
if p.ConnReq() != nil {
4986+
s.connMgr.Remove(p.ConnReq().ID())
4987+
}
4988+
4989+
// Remove the peer's access permission from the access manager.
4990+
peerPubStr := string(p.IdentityKey().SerializeCompressed())
4991+
s.peerAccessMan.removePeerAccess(peerPubStr)
4992+
4993+
// Copy the peer's error buffer across to the server if it has
4994+
// any items in it so that we can restore peer errors across
4995+
// connections. We need to look up the error after the peer has
4996+
// been disconnected because we write the error in the
4997+
// `Disconnect` method.
4998+
s.mu.Lock()
4999+
if p.ErrorBuffer().Total() > 0 {
5000+
s.peerErrors[pubStr] = p.ErrorBuffer()
5001+
}
5002+
s.mu.Unlock()
5003+
5004+
// Inform the peer notifier of a peer offline event so that it
5005+
// can be reported to clients listening for peer events.
5006+
var pubKey [33]byte
5007+
copy(pubKey[:], pubSer)
49915008

4992-
s.peerNotifier.NotifyPeerOffline(pubKey)
5009+
s.peerNotifier.NotifyPeerOffline(pubKey)
5010+
}()
49935011
}
49945012

49955013
// ConnectToPeer requests that the server connect to a Lightning Network peer
@@ -5130,7 +5148,10 @@ func (s *server) DisconnectPeer(pubKey *btcec.PublicKey) error {
51305148

51315149
// Remove the peer by calling Disconnect. Previously this was done with
51325150
// removePeer, which bypassed the peerTerminationWatcher.
5133-
peer.Disconnect(fmt.Errorf("server: DisconnectPeer called"))
5151+
//
5152+
// NOTE: We call it in a goroutine to avoid blocking the main server
5153+
// goroutine because we might hold the server's mutex.
5154+
go peer.Disconnect(fmt.Errorf("server: DisconnectPeer called"))
51345155

51355156
return nil
51365157
}

0 commit comments

Comments
 (0)