Skip to content

Commit 3493b08

Browse files
authored
Merge pull request #9978 from ziggie1984/fix-peer-connection
multi: fix deadlock in p2p race condition
2 parents 5f00be2 + 24e3958 commit 3493b08

File tree

4 files changed

+69
-37
lines changed

4 files changed

+69
-37
lines changed

docs/release-notes/release-notes-0.19.2.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@
2929
- [Fixed](https://github.com/lightningnetwork/lnd/pull/9962) a case where the
3030
node may panic if it's running in the remote signer mode.
3131

32+
- [Fixed](https://github.com/lightningnetwork/lnd/pull/9978) a deadlock which
33+
can happen when the peer start-up has not yet completed but a another p2p
34+
connection attempt tries to disconnect the peer.
35+
3236
# New Features
3337

3438
## Functional Enhancements

peer/brontide.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1591,6 +1591,10 @@ func (p *Brontide) WaitForDisconnect(ready chan struct{}) {
15911591
// Disconnect terminates the connection with the remote peer. Additionally, a
15921592
// signal is sent to the server and htlcSwitch indicating the resources
15931593
// allocated to the peer can now be cleaned up.
1594+
//
1595+
// NOTE: Be aware that this method will block if the peer is still starting up.
1596+
// Therefore consider starting it in a goroutine if you cannot guarantee that
1597+
// the peer has finished starting up before calling this method.
15941598
func (p *Brontide) Disconnect(reason error) {
15951599
if !atomic.CompareAndSwapInt32(&p.disconnect, 0, 1) {
15961600
return
@@ -1603,7 +1607,8 @@ func (p *Brontide) Disconnect(reason error) {
16031607
// started, otherwise we will skip reading it as this chan won't be
16041608
// closed, hence blocks forever.
16051609
if atomic.LoadInt32(&p.started) == 1 {
1606-
p.log.Debugf("Started, waiting on startReady signal")
1610+
p.log.Debugf("Peer hasn't finished starting up yet, waiting " +
1611+
"on startReady signal before closing connection")
16071612

16081613
select {
16091614
case <-p.startReady:

rpcserver.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1870,11 +1870,13 @@ func (r *rpcServer) DisconnectPeer(ctx context.Context,
18701870

18711871
// In order to avoid erroneously disconnecting from a peer that we have
18721872
// an active channel with, if we have any channels active with this
1873-
// peer, then we'll disallow disconnecting from them.
1873+
// peer, then we'll disallow disconnecting from them in certain
1874+
// situations.
18741875
if len(nodeChannels) != 0 {
1875-
// If we are not in a dev environment or the configed dev value
1876-
// `unsafedisconnect` is false, we return an error since there
1877-
// are active channels.
1876+
// If the configured dev value `unsafedisconnect` is false, we
1877+
// return an error since there are active channels. For
1878+
// production environments, we allow disconnecting from a peer
1879+
// even if there are channels active with them.
18781880
if !r.cfg.Dev.GetUnsafeDisconnect() {
18791881
return nil, fmt.Errorf("cannot disconnect from "+
18801882
"peer(%x), still has %d active channels",

server.go

Lines changed: 53 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4091,7 +4091,7 @@ func (s *server) InboundPeerConnected(conn net.Conn) {
40914091
// Remove the current peer from the server's internal state and
40924092
// signal that the peer termination watcher does not need to
40934093
// execute for this peer.
4094-
s.removePeer(connectedPeer)
4094+
s.removePeerUnsafe(connectedPeer)
40954095
s.ignorePeerTermination[connectedPeer] = struct{}{}
40964096
s.scheduledPeerConnection[pubStr] = func() {
40974097
s.peerConnected(conn, nil, true)
@@ -4208,7 +4208,7 @@ func (s *server) OutboundPeerConnected(connReq *connmgr.ConnReq, conn net.Conn)
42084208
// Remove the current peer from the server's internal state and
42094209
// signal that the peer termination watcher does not need to
42104210
// execute for this peer.
4211-
s.removePeer(connectedPeer)
4211+
s.removePeerUnsafe(connectedPeer)
42124212
s.ignorePeerTermination[connectedPeer] = struct{}{}
42134213
s.scheduledPeerConnection[pubStr] = func() {
42144214
s.peerConnected(conn, connReq, false)
@@ -4730,7 +4730,7 @@ func (s *server) peerTerminationWatcher(p *peer.Brontide, ready chan struct{}) {
47304730

47314731
// First, cleanup any remaining state the server has regarding the peer
47324732
// in question.
4733-
s.removePeer(p)
4733+
s.removePeerUnsafe(p)
47344734

47354735
// Next, check to see if this is a persistent peer or not.
47364736
if _, ok := s.persistentPeers[pubStr]; !ok {
@@ -4939,29 +4939,24 @@ func (s *server) connectToPersistentPeer(pubKeyStr string) {
49394939
}()
49404940
}
49414941

4942-
// removePeer removes the passed peer from the server's state of all active
4943-
// peers.
4944-
func (s *server) removePeer(p *peer.Brontide) {
4942+
// removePeerUnsafe removes the passed peer from the server's state of all
4943+
// active peers.
4944+
//
4945+
// NOTE: Server mutex must be held when calling this function.
4946+
func (s *server) removePeerUnsafe(p *peer.Brontide) {
49454947
if p == nil {
49464948
return
49474949
}
49484950

4949-
srvrLog.Debugf("removing peer %v", p)
4950-
4951-
// As the peer is now finished, ensure that the TCP connection is
4952-
// closed and all of its related goroutines have exited.
4953-
p.Disconnect(fmt.Errorf("server: disconnecting peer %v", p))
4954-
4955-
// If this peer had an active persistent connection request, remove it.
4956-
if p.ConnReq() != nil {
4957-
s.connMgr.Remove(p.ConnReq().ID())
4958-
}
4951+
srvrLog.Debugf("Removing peer %v", p)
49594952

4960-
// Ignore deleting peers if we're shutting down.
4953+
// Exit early if we have already been instructed to shutdown, the peers
4954+
// will be disconnected in the server shutdown process.
49614955
if s.Stopped() {
49624956
return
49634957
}
49644958

4959+
// Capture the peer's public key and string representation.
49654960
pKey := p.PubKey()
49664961
pubSer := pKey[:]
49674962
pubStr := string(pubSer)
@@ -4974,22 +4969,45 @@ func (s *server) removePeer(p *peer.Brontide) {
49744969
delete(s.outboundPeers, pubStr)
49754970
}
49764971

4977-
// Remove the peer's access permission from the access manager.
4978-
peerPubStr := string(p.IdentityKey().SerializeCompressed())
4979-
s.peerAccessMan.removePeerAccess(peerPubStr)
4972+
// When removing the peer we make sure to disconnect it asynchronously
4973+
// to avoid blocking the main server goroutine because it is holding the
4974+
// server's mutex. Disconnecting the peer might block and wait until the
4975+
// peer has fully started up. This can happen if an inbound and outbound
4976+
// race condition occurs.
4977+
s.wg.Add(1)
4978+
go func() {
4979+
defer s.wg.Done()
49804980

4981-
// Copy the peer's error buffer across to the server if it has any items
4982-
// in it so that we can restore peer errors across connections.
4983-
if p.ErrorBuffer().Total() > 0 {
4984-
s.peerErrors[pubStr] = p.ErrorBuffer()
4985-
}
4981+
p.Disconnect(fmt.Errorf("server: disconnecting peer %v", p))
49864982

4987-
// Inform the peer notifier of a peer offline event so that it can be
4988-
// reported to clients listening for peer events.
4989-
var pubKey [33]byte
4990-
copy(pubKey[:], pubSer)
4983+
// If this peer had an active persistent connection request,
4984+
// remove it.
4985+
if p.ConnReq() != nil {
4986+
s.connMgr.Remove(p.ConnReq().ID())
4987+
}
4988+
4989+
// Remove the peer's access permission from the access manager.
4990+
peerPubStr := string(p.IdentityKey().SerializeCompressed())
4991+
s.peerAccessMan.removePeerAccess(peerPubStr)
4992+
4993+
// Copy the peer's error buffer across to the server if it has
4994+
// any items in it so that we can restore peer errors across
4995+
// connections. We need to look up the error after the peer has
4996+
// been disconnected because we write the error in the
4997+
// `Disconnect` method.
4998+
s.mu.Lock()
4999+
if p.ErrorBuffer().Total() > 0 {
5000+
s.peerErrors[pubStr] = p.ErrorBuffer()
5001+
}
5002+
s.mu.Unlock()
5003+
5004+
// Inform the peer notifier of a peer offline event so that it
5005+
// can be reported to clients listening for peer events.
5006+
var pubKey [33]byte
5007+
copy(pubKey[:], pubSer)
49915008

4992-
s.peerNotifier.NotifyPeerOffline(pubKey)
5009+
s.peerNotifier.NotifyPeerOffline(pubKey)
5010+
}()
49935011
}
49945012

49955013
// ConnectToPeer requests that the server connect to a Lightning Network peer
@@ -5129,8 +5147,11 @@ func (s *server) DisconnectPeer(pubKey *btcec.PublicKey) error {
51295147
delete(s.persistentPeersBackoff, pubStr)
51305148

51315149
// Remove the peer by calling Disconnect. Previously this was done with
5132-
// removePeer, which bypassed the peerTerminationWatcher.
5133-
peer.Disconnect(fmt.Errorf("server: DisconnectPeer called"))
5150+
// removePeerUnsafe, which bypassed the peerTerminationWatcher.
5151+
//
5152+
// NOTE: We call it in a goroutine to avoid blocking the main server
5153+
// goroutine because we might hold the server's mutex.
5154+
go peer.Disconnect(fmt.Errorf("server: DisconnectPeer called"))
51345155

51355156
return nil
51365157
}

0 commit comments

Comments
 (0)