Skip to content

Commit 54564ff

Browse files
cskiralyfjl
authored andcommitted
p2p: fix dial metrics not picking up the right error (ethereum#31621)
Our metrics related to dial errors were off. The original error was not wrapped, so the caller function had no chance of picking it up. Therefore the most common error, which is "TooManyPeers", was not correctly counted. The metrics were originally introduced in ethereum#27621 I was thinking of various possible solutions. - the one proposed here wraps both the new error and the origial error. It is not a pattern we use in other parts of the code, but works. This is maybe the smallest possible change. - as an alternate, I could write a proper `errProtoHandshakeError` with it's own wrapped error - finally, I'm not even sure we need `errProtoHandshakeError`, maybe we could just pass up the original error. --------- Signed-off-by: Csaba Kiraly <csaba.kiraly@gmail.com> Co-authored-by: Felix Lange <fjl@twurst.com>
1 parent 27286a7 commit 54564ff

File tree

3 files changed

+34
-23
lines changed

3 files changed

+34
-23
lines changed

p2p/metrics.go

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ var (
4949
serveSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success", nil)
5050
dialMeter = metrics.NewRegisteredMeter("p2p/dials", nil)
5151
dialSuccessMeter = metrics.NewRegisteredMeter("p2p/dials/success", nil)
52-
dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil)
52+
dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil) // dial timeout; no route to host; connection refused; network is unreachable
5353

5454
// count peers that stayed connected for at least 1 min
5555
serve1MinSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success/1min", nil)
@@ -61,34 +61,41 @@ var (
6161
dialSelf = metrics.NewRegisteredMeter("p2p/dials/error/self", nil)
6262
dialUselessPeer = metrics.NewRegisteredMeter("p2p/dials/error/useless", nil)
6363
dialUnexpectedIdentity = metrics.NewRegisteredMeter("p2p/dials/error/id/unexpected", nil)
64-
dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil)
65-
dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil)
64+
dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil) // EOF; connection reset during handshake; message too big; i/o timeout
65+
dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil) // EOF
66+
67+
// capture the rest of errors that are not handled by the above meters
68+
dialOtherError = metrics.NewRegisteredMeter("p2p/dials/error/other", nil)
6669
)
6770

68-
// markDialError matches errors that occur while setting up a dial connection
69-
// to the corresponding meter.
71+
// markDialError matches errors that occur while setting up a dial connection to the
72+
// corresponding meter. We don't maintain meters for evert possible error, just for
73+
// the most interesting ones.
7074
func markDialError(err error) {
7175
if !metrics.Enabled() {
7276
return
7377
}
74-
if err2 := errors.Unwrap(err); err2 != nil {
75-
err = err2
76-
}
77-
switch err {
78-
case DiscTooManyPeers:
78+
79+
var reason DiscReason
80+
var handshakeErr *protoHandshakeError
81+
d := errors.As(err, &reason)
82+
switch {
83+
case d && reason == DiscTooManyPeers:
7984
dialTooManyPeers.Mark(1)
80-
case DiscAlreadyConnected:
85+
case d && reason == DiscAlreadyConnected:
8186
dialAlreadyConnected.Mark(1)
82-
case DiscSelf:
87+
case d && reason == DiscSelf:
8388
dialSelf.Mark(1)
84-
case DiscUselessPeer:
89+
case d && reason == DiscUselessPeer:
8590
dialUselessPeer.Mark(1)
86-
case DiscUnexpectedIdentity:
91+
case d && reason == DiscUnexpectedIdentity:
8792
dialUnexpectedIdentity.Mark(1)
88-
case errEncHandshakeError:
89-
dialEncHandshakeError.Mark(1)
90-
case errProtoHandshakeError:
93+
case errors.As(err, &handshakeErr):
9194
dialProtoHandshakeError.Mark(1)
95+
case errors.Is(err, errEncHandshakeError):
96+
dialEncHandshakeError.Mark(1)
97+
default:
98+
dialOtherError.Mark(1)
9299
}
93100
}
94101

p2p/server.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,15 @@ const (
6666
)
6767

6868
var (
69-
errServerStopped = errors.New("server stopped")
70-
errEncHandshakeError = errors.New("rlpx enc error")
71-
errProtoHandshakeError = errors.New("rlpx proto error")
69+
errServerStopped = errors.New("server stopped")
70+
errEncHandshakeError = errors.New("rlpx enc error")
7271
)
7372

73+
type protoHandshakeError struct{ err error }
74+
75+
func (e *protoHandshakeError) Error() string { return fmt.Sprintf("rlpx proto error: %v", e.err) }
76+
func (e *protoHandshakeError) Unwrap() error { return e.err }
77+
7478
// Server manages all peer connections.
7579
type Server struct {
7680
// Config fields may not be modified while the server is running.
@@ -907,7 +911,7 @@ func (srv *Server) setupConn(c *conn, dialDest *enode.Node) error {
907911
phs, err := c.doProtoHandshake(srv.ourHandshake)
908912
if err != nil {
909913
clog.Trace("Failed p2p handshake", "err", err)
910-
return fmt.Errorf("%w: %v", errProtoHandshakeError, err)
914+
return &protoHandshakeError{err: err}
911915
}
912916
if id := c.node.ID(); !bytes.Equal(crypto.Keccak256(phs.ID), id[:]) {
913917
clog.Trace("Wrong devp2p handshake identity", "phsid", hex.EncodeToString(phs.ID))

p2p/server_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -410,11 +410,11 @@ func TestServerSetupConn(t *testing.T) {
410410
wantCloseErr: DiscUnexpectedIdentity,
411411
},
412412
{
413-
tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: errProtoHandshakeError},
413+
tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: DiscTooManyPeers},
414414
dialDest: enode.NewV4(clientpub, nil, 0, 0),
415415
flags: dynDialedConn,
416416
wantCalls: "doEncHandshake,doProtoHandshake,close,",
417-
wantCloseErr: errProtoHandshakeError,
417+
wantCloseErr: DiscTooManyPeers,
418418
},
419419
{
420420
tt: &setupTransport{pubkey: srvpub, phs: protoHandshake{ID: crypto.FromECDSAPub(srvpub)[1:]}},

0 commit comments

Comments
 (0)