Skip to content

p2p: fix dial metrics not picking up the right error #31621

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 24 additions & 17 deletions p2p/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,42 +49,49 @@ var (
serveSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success", nil)
dialMeter = metrics.NewRegisteredMeter("p2p/dials", nil)
dialSuccessMeter = metrics.NewRegisteredMeter("p2p/dials/success", nil)
dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil)
dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil) // dial timeout; no route to host; connection refused; network is unreachable

// handshake error meters
dialTooManyPeers = metrics.NewRegisteredMeter("p2p/dials/error/saturated", nil)
dialAlreadyConnected = metrics.NewRegisteredMeter("p2p/dials/error/known", nil)
dialSelf = metrics.NewRegisteredMeter("p2p/dials/error/self", nil)
dialUselessPeer = metrics.NewRegisteredMeter("p2p/dials/error/useless", nil)
dialUnexpectedIdentity = metrics.NewRegisteredMeter("p2p/dials/error/id/unexpected", nil)
dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil)
dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil)
dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil) // EOF; connection reset during handshake; message too big; i/o timeout
dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil) // EOF

// capture the rest of errors that are not handled by the above meters
dialOtherError = metrics.NewRegisteredMeter("p2p/dials/error/other", nil)
)

// markDialError matches errors that occur while setting up a dial connection
// to the corresponding meter.
// markDialError matches errors that occur while setting up a dial connection to the
// corresponding meter. We don't maintain meters for evert possible error, just for
// the most interesting ones.
func markDialError(err error) {
if !metrics.Enabled() {
return
}
if err2 := errors.Unwrap(err); err2 != nil {
Copy link
Contributor

@fjl fjl Apr 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think what we really want here is

var reason DiscReason
if errors.As(err, &reason) {
    switch reason {
    ...
    }
}
var phe *protoHandshakeError
if errors.As(err, &phe) {
    dialProtoHandshakeError.Mark(1)
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds reasonable, and also cleaner. I will verify how it works, and come back with a patch.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The dialProtoHandshakeError would have to be in an else branch, and we would still need to handle dialEncHandshakeError separately.
I would say the original with Is and As is better. I've added a catch-all and some comments to make it even cleaner.

err = err2
}
switch err {
case DiscTooManyPeers:

var reason DiscReason
var handshakeErr *protoHandshakeError
d := errors.As(err, &reason)
switch {
case d && reason == DiscTooManyPeers:
dialTooManyPeers.Mark(1)
case DiscAlreadyConnected:
case d && reason == DiscAlreadyConnected:
dialAlreadyConnected.Mark(1)
case DiscSelf:
case d && reason == DiscSelf:
dialSelf.Mark(1)
case DiscUselessPeer:
case d && reason == DiscUselessPeer:
dialUselessPeer.Mark(1)
case DiscUnexpectedIdentity:
case d && reason == DiscUnexpectedIdentity:
dialUnexpectedIdentity.Mark(1)
case errEncHandshakeError:
dialEncHandshakeError.Mark(1)
case errProtoHandshakeError:
case errors.As(err, &handshakeErr):
dialProtoHandshakeError.Mark(1)
case errors.Is(err, errEncHandshakeError):
dialEncHandshakeError.Mark(1)
default:
dialOtherError.Mark(1)
}
}

Expand Down
12 changes: 8 additions & 4 deletions p2p/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,15 @@ const (
)

var (
errServerStopped = errors.New("server stopped")
errEncHandshakeError = errors.New("rlpx enc error")
errProtoHandshakeError = errors.New("rlpx proto error")
errServerStopped = errors.New("server stopped")
errEncHandshakeError = errors.New("rlpx enc error")
)

type protoHandshakeError struct{ err error }

func (e *protoHandshakeError) Error() string { return fmt.Sprintf("rlpx proto error: %v", e.err) }
func (e *protoHandshakeError) Unwrap() error { return e.err }

// Server manages all peer connections.
type Server struct {
// Config fields may not be modified while the server is running.
Expand Down Expand Up @@ -907,7 +911,7 @@ func (srv *Server) setupConn(c *conn, dialDest *enode.Node) error {
phs, err := c.doProtoHandshake(srv.ourHandshake)
if err != nil {
clog.Trace("Failed p2p handshake", "err", err)
return fmt.Errorf("%w: %v", errProtoHandshakeError, err)
return &protoHandshakeError{err: err}
}
if id := c.node.ID(); !bytes.Equal(crypto.Keccak256(phs.ID), id[:]) {
clog.Trace("Wrong devp2p handshake identity", "phsid", hex.EncodeToString(phs.ID))
Expand Down
4 changes: 2 additions & 2 deletions p2p/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -410,11 +410,11 @@ func TestServerSetupConn(t *testing.T) {
wantCloseErr: DiscUnexpectedIdentity,
},
{
tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: errProtoHandshakeError},
tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: DiscTooManyPeers},
dialDest: enode.NewV4(clientpub, nil, 0, 0),
flags: dynDialedConn,
wantCalls: "doEncHandshake,doProtoHandshake,close,",
wantCloseErr: errProtoHandshakeError,
wantCloseErr: DiscTooManyPeers,
},
{
tt: &setupTransport{pubkey: srvpub, phs: protoHandshake{ID: crypto.FromECDSAPub(srvpub)[1:]}},
Expand Down