Skip to content

Commit dce8b39

Browse files
authored
feat: track libp2p dial error (#7712)
**Motivation**

- To know the reason why we cannot dial a peer. In `peerdas-devnet-6` this happens a lot. cc @matthewkeil

**Description**

- Model all possible libp2p errors.
- Do not dial a peer if it has no multiaddr; this follows up #7708.
- New metric `notDialReason` in Discovery, which I brought over from the `peerDAS` branch.

**Testing**

This is how it showed up for `peerdas-devnet-6`:

<img width="1540" alt="Screenshot 2025-04-17 at 13 16 09" src="https://github.com/user-attachments/assets/5fca773c-3ba6-42e3-a41e-be963d25d774" />

Co-authored-by: Tuyen Nguyen <twoeths@users.noreply.github.com>
1 parent 713062c commit dce8b39

File tree

3 files changed

+80
-2
lines changed

3 files changed

+80
-2
lines changed

packages/beacon-node/src/network/core/metrics.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import {SubnetID} from "@lodestar/types";
22
import {RegistryMetricCreator} from "../../metrics/utils/registryMetricCreator.js";
3+
import {Libp2pError} from "../libp2p/error.js";
34
import {SubnetType} from "../metadata.js";
4-
import {DiscoveredPeerStatus} from "../peers/discover.js";
5+
import {DiscoveredPeerStatus, NotDialReason} from "../peers/discover.js";
56
import {SubnetSource} from "../subnets/attnetsService.js";
67

78
export type NetworkCoreMetrics = ReturnType<typeof createNetworkCoreMetrics>;
@@ -159,12 +160,22 @@ export function createNetworkCoreMetrics(register: RegistryMetricCreator) {
159160
name: "lodestar_discovery_total_dial_attempts",
160161
help: "Total dial attempts by peer discovery",
161162
}),
163+
notDialReason: register.gauge<{reason: NotDialReason}>({
164+
name: "lodestar_discovery_not_dial_reason_total_count",
165+
help: "Total count of not dial reasons",
166+
labelNames: ["reason"],
167+
}),
162168
dialTime: register.histogram<{status: string}>({
163169
name: "lodestar_discovery_dial_time_seconds",
164170
help: "Time to dial peers in seconds",
165171
labelNames: ["status"],
166172
buckets: [0.1, 5, 60],
167173
}),
174+
dialError: register.gauge<{reason: Libp2pError}>({
175+
name: "lodestar_discovery_dial_error_total_count",
176+
help: "Total count of dial errors",
177+
labelNames: ["reason"],
178+
}),
168179
},
169180

170181
reqResp: {
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/**
 * Names of the error classes libp2p can throw, plus a catch-all `OtherError`.
 *
 * Each member's VALUE is the string compared against `error.name` by
 * `getLibp2pError`. Note that the member key and value are not always
 * identical (`InvalidCidError` has the value `"InvalidCIDError"`), so lookups
 * must be done by value, never by key.
 */
export enum Libp2pError {
  // https://github.com/libp2p/js-libp2p/blob/b936324a92038d9447983bb11db623c1b5a17a76/packages/libp2p/src/errors.ts#L6
  MissingServiceError = "MissingServiceError",
  UnmetServiceDependenciesError = "UnmetServiceDependenciesError",
  NoContentRoutersError = "NoContentRoutersError",
  NoPeerRoutersError = "NoPeerRoutersError",
  QueriedForSelfError = "QueriedForSelfError",
  UnhandledProtocolError = "UnhandledProtocolError",
  DuplicateProtocolHandlerError = "DuplicateProtocolHandlerError",
  DialDeniedError = "DialDeniedError",
  UnsupportedListenAddressError = "UnsupportedListenAddressError",
  UnsupportedListenAddressesError = "UnsupportedListenAddressesError",
  NoValidAddressesError = "NoValidAddressesError",
  ConnectionInterceptedError = "ConnectionInterceptedError",
  ConnectionDeniedError = "ConnectionDeniedError",
  MuxerUnavailableError = "MuxerUnavailableError",
  EncryptionFailedError = "EncryptionFailedError",
  TransportUnavailableError = "TransportUnavailableError",
  // https://github.com/libp2p/js-libp2p/blob/b936324a92038d9447983bb11db623c1b5a17a76/packages/interface/src/errors.ts#L6
  AbortError = "AbortError",
  UnexpectedPeerError = "UnexpectedPeerError",
  InvalidCryptoExchangeError = "InvalidCryptoExchangeError",
  InvalidParametersError = "InvalidParametersError",
  InvalidPublicKeyError = "InvalidPublicKeyError",
  InvalidPrivateKeyError = "InvalidPrivateKeyError",
  UnsupportedOperationError = "UnsupportedOperationError",
  ConnectionClosingError = "ConnectionClosingError",
  ConnectionClosedError = "ConnectionClosedError",
  ConnectionFailedError = "ConnectionFailedError",
  MuxerClosedError = "MuxerClosedError",
  StreamResetError = "StreamResetError",
  StreamStateError = "StreamStateError",
  NotFoundError = "NotFoundError",
  InvalidPeerIdError = "InvalidPeerIdError",
  InvalidMultiaddrError = "InvalidMultiaddrError",
  // Key is camel-cased, but the value keeps the upper-case "CID" spelling
  InvalidCidError = "InvalidCIDError",
  InvalidMultihashError = "InvalidMultihashError",
  UnsupportedProtocolError = "UnsupportedProtocolError",
  InvalidMessageError = "InvalidMessageError",
  ProtocolError = "ProtocolError",
  TimeoutError = "TimeoutError",
  NotStartedError = "NotStartedError",
  AlreadyStartedError = "AlreadyStartedError",
  DialError = "DialError",
  ListenError = "ListenError",
  LimitedConnectionError = "LimitedConnectionError",
  TooManyInboundProtocolStreamsError = "TooManyInboundProtocolStreamsError",
  TooManyOutboundProtocolStreamsError = "TooManyOutboundProtocolStreamsError",
  UnsupportedKeyTypeError = "UnsupportedKeyTypeError",
  // Fallback for any error whose name is not listed above
  OtherError = "OtherError",
}

/**
 * Every `Libp2pError` value, precomputed once so classification is a single
 * set lookup. Built from the enum VALUES because those are what `error.name`
 * is compared against.
 */
const libp2pErrorNames: ReadonlySet<string> = new Set<string>(Object.values(Libp2pError));

/**
 * Classify an error by its `name` into a `Libp2pError` member.
 *
 * The match is done against enum values rather than enum keys, so
 * `"InvalidCIDError"` resolves to `Libp2pError.InvalidCidError`, and inherited
 * object properties such as `"toString"` or `"constructor"` can never be
 * returned by accident.
 *
 * @param error - the caught error; tolerated to be `null`/`undefined` or a
 *   non-Error value at runtime, since callers typically pass a cast `catch` variable
 * @returns the matching `Libp2pError`, or `Libp2pError.OtherError` when the
 *   name is missing, not a string, or not a known libp2p error name
 */
export function getLibp2pError(error: Error): Libp2pError {
  // Read defensively: anything can be thrown in JS, regardless of the declared type
  const name: unknown = (error as {name?: unknown} | null | undefined)?.name;
  if (typeof name === "string" && libp2pErrorNames.has(name)) {
    // Safe narrowing: membership in the value set was just checked
    return name as Libp2pError;
  }
  return Libp2pError.OtherError;
}

packages/beacon-node/src/network/peers/discover.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import {NetworkCoreMetrics} from "../core/metrics.js";
1010
import {Discv5Worker} from "../discv5/index.js";
1111
import {LodestarDiscv5Opts} from "../discv5/types.js";
1212
import {Libp2p} from "../interface.js";
13+
import {getLibp2pError} from "../libp2p/error.js";
1314
import {ENRKey, SubnetType} from "../metadata.js";
1415
import {getConnectionsMap, prettyPrintPeerId} from "../util.js";
1516
import {IPeerRpcScoreStore, ScoreState} from "./score/index.js";
@@ -55,6 +56,12 @@ export enum DiscoveredPeerStatus {
5556
no_multiaddrs = "no_multiaddrs",
5657
}
5758

59+
/**
 * Reasons why a discovered peer is not dialed. Used as the `reason` label of
 * the discovery `notDialReason` metric.
 *
 * - `no_multiaddrs`: recorded when the peer returned by `peerStore.merge` has
 *   an empty `addresses` list, in which case the dial is skipped.
 * - The two `not_contain_requested_*` members are not used in the visible
 *   hunks; presumably they cover peers that serve none of the requested
 *   sampling groups / attnet-syncnet subnets — TODO confirm against callers.
 */
export enum NotDialReason {
60+
not_contain_requested_sampling_groups = "not_contain_requested_sampling_groups",
61+
not_contain_requested_attnet_syncnet_subnets = "not_contain_requested_attnet_syncnet_subnets",
62+
no_multiaddrs = "no_multiaddrs",
63+
}
64+
5865
type UnixMs = number;
5966
/**
6067
* Maintain peersToConnect to avoid having too many topic peers at some point.
@@ -448,7 +455,11 @@ export class PeerDiscovery {
448455

449456
// Must add the multiaddrs array to the address book before dialing
450457
// https://github.com/libp2p/js-libp2p/blob/aec8e3d3bb1b245051b60c2a890550d262d5b062/src/index.js#L638
451-
await this.libp2p.peerStore.merge(peerId, {multiaddrs: [multiaddrTCP]});
458+
const peer = await this.libp2p.peerStore.merge(peerId, {multiaddrs: [multiaddrTCP]});
459+
if (peer.addresses.length === 0) {
460+
this.metrics?.discovery.notDialReason.inc({reason: NotDialReason.no_multiaddrs});
461+
return;
462+
}
452463

453464
// Note: PeerDiscovery adds the multiaddrTCP beforehand
454465
const peerIdShort = prettyPrintPeerId(peerId);
@@ -466,6 +477,7 @@ export class PeerDiscovery {
466477
} catch (e) {
467478
timer?.({status: "error"});
468479
formatLibp2pDialError(e as Error);
480+
this.metrics?.discovery.dialError.inc({reason: getLibp2pError(e as Error)});
469481
this.logger.debug("Error dialing discovered peer", {peer: peerIdShort}, e as Error);
470482
}
471483
}

0 commit comments

Comments
 (0)