From 9ee5fc6551d939e26d32a77a5c0e50175fc0b6cb Mon Sep 17 00:00:00 2001 From: Csaba Kiraly Date: Mon, 14 Apr 2025 10:13:45 +0200 Subject: [PATCH] p2p: better dial/serve success metrics (#31629) Our previous success metrics gave success even if a peer disconnected right after connection. These metrics only count peers that stayed connected for at least 1 min. The 1 min limit is an arbitrary choice. We do not use this for decision logic, only statistics. --- p2p/metrics.go | 4 ++++ p2p/peer.go | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/p2p/metrics.go b/p2p/metrics.go index 76982b1a8b..7eaa7977d0 100644 --- a/p2p/metrics.go +++ b/p2p/metrics.go @@ -51,6 +51,10 @@ var ( dialSuccessMeter metrics.Meter = metrics.NilMeter{} dialConnectionError metrics.Meter = metrics.NilMeter{} + // count peers that stayed connected for at least 1 min + serve1MinSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success/1min", nil) + dial1MinSuccessMeter = metrics.NewRegisteredMeter("p2p/dials/success/1min", nil) + // handshake error meters dialTooManyPeers = metrics.NewRegisteredMeter("p2p/dials/error/saturated", nil) dialAlreadyConnected = metrics.NewRegisteredMeter("p2p/dials/error/known", nil) diff --git a/p2p/peer.go b/p2p/peer.go index f751173a0a..d66b24da50 100644 --- a/p2p/peer.go +++ b/p2p/peer.go @@ -254,6 +254,8 @@ func (p *Peer) run() (remoteRequested bool, err error) { p.wg.Add(2) go p.readLoop(readErr) go p.pingLoop() + live1min := time.NewTimer(1 * time.Minute) + defer live1min.Stop() // Start all protocol handlers. writeStart <- struct{}{} @@ -285,6 +287,12 @@ loop: case err = <-p.disc: reason = discReasonForError(err) break loop + case <-live1min.C: + if p.Inbound() { + serve1MinSuccessMeter.Mark(1) + } else { + dial1MinSuccessMeter.Mark(1) + } } }