Skip to content

Commit d04ce46

Browse files
Feat/detailed coordination window diagnostics (#3861)
# Detailed Coordination Window Diagnostics ## Summary This PR introduces comprehensive diagnostics and metrics tracking for tBTC coordination windows, significantly enhancing observability into the coordination process. The changes add detailed per-window and per-wallet metrics, improve network diagnostics, and expand performance monitoring capabilities. A new comprehensive metrics tracking system for coordination windows that provides: - **Per-Window Tracking**: Each coordination window is tracked with: - Window identification (index, coordination block) - Timing information (start time, end time, duration, block ranges) - Coordination statistics (wallets coordinated, successful/failed counts) - Leader distribution across wallets - Action type breakdown - Fault statistics (by type and culprit) - **Per-Wallet Details**: For each wallet coordinated in a window: - Wallet public key hash - Leader address - Action type - Success/failure status - Duration - Error messages (if failed) - Detailed fault information - **Memory Management**: Tracks up to 100 recent windows (~25 hours) with automatic cleanup of older windows to prevent unbounded memory growth
2 parents e0d801f + d8323e0 commit d04ce46

13 files changed

Lines changed: 852 additions & 80 deletions

config/peers_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,8 @@ func TestResolvePeers(t *testing.T) {
2424
"sepolia network": {
2525
network: network.Testnet,
2626
expectedPeers: []string{
27-
"/dns4/bootstrap-0.test.keep.network/tcp/3919/ipfs/16Uiu2HAmCcfVpHwfBKNFbQuhvGuFXHVLQ65gB4sJm7HyrcZuLttH",
28-
"/dns4/bootstrap-1.test.keep.network/tcp/3919/ipfs/16Uiu2HAm3eJtyFKAttzJ85NLMromHuRg4yyum3CREMf6CHBBV6KY",
2927
"/dns4/bst-a01.test.keep.boar.network/tcp/6001/ipfs/16Uiu2HAmSLDSahiKyTbCNNu8wJmZAsiKF7wuYJ8mogY8ZuAG1jhu",
28+
"/dns4/keep-validator-0.eks-ap-northeast-2-secure.staging.staked.cloud/tcp/3919/ipfs/16Uiu2HAm77eSvRq5ioD4J8VFPkq3bJHBEHkssCuiFkgAoABwjo2S",
3029
},
3130
},
3231
"developer network": {

pkg/clientinfo/performance.go

Lines changed: 102 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ import (
77
"runtime"
88
"sync"
99
"time"
10+
11+
// gopsutil provides cross-platform system and process utilities.
12+
// It supports linux/amd64 and darwin/amd64 (the target platforms for this codebase),
13+
// as well as Windows, FreeBSD, OpenBSD, and Solaris.
14+
"github.com/shirou/gopsutil/cpu"
15+
"github.com/shirou/gopsutil/mem"
1016
)
1117

1218
// PerformanceMetricsRecorder provides a simple interface for recording
@@ -107,27 +113,40 @@ func (pm *PerformanceMetrics) registerAllMetrics() {
107113
MetricSigningSuccessTotal,
108114
MetricSigningFailedTotal,
109115
MetricSigningTimeoutsTotal,
116+
MetricRedemptionExecutionsTotal,
117+
MetricRedemptionExecutionsSuccessTotal,
118+
MetricRedemptionExecutionsFailedTotal,
119+
MetricRedemptionProofSubmissionsTotal,
120+
MetricRedemptionProofSubmissionsSuccessTotal,
121+
MetricRedemptionProofSubmissionsFailedTotal,
110122
MetricWalletActionsTotal,
111123
MetricWalletActionSuccessTotal,
112124
MetricWalletActionFailedTotal,
113125
MetricWalletHeartbeatFailuresTotal,
114126
MetricCoordinationWindowsDetectedTotal,
115127
MetricCoordinationProceduresExecutedTotal,
116128
MetricCoordinationFailedTotal,
129+
MetricCoordinationLeaderTimeoutTotal,
117130
MetricPeerConnectionsTotal,
118131
MetricPeerDisconnectionsTotal,
119132
MetricMessageBroadcastTotal,
120133
MetricMessageReceivedTotal,
121134
MetricPingTestsTotal,
122135
MetricPingTestSuccessTotal,
123136
MetricPingTestFailedTotal,
137+
MetricNetworkJoinRequestsTotal,
138+
MetricNetworkJoinRequestsSuccessTotal,
139+
MetricNetworkJoinRequestsFailedTotal,
140+
MetricFirewallRejectionsTotal,
124141
MetricWalletDispatcherRejectedTotal,
125142
}
126143

127144
// First, initialize all counters in the map
145+
pm.countersMutex.Lock()
128146
for _, name := range counters {
129147
pm.counters[name] = &counter{value: 0}
130148
}
149+
pm.countersMutex.Unlock()
131150

132151
// Then, register observers (this prevents concurrent map read/write)
133152
for _, name := range counters {
@@ -159,7 +178,9 @@ func (pm *PerformanceMetrics) registerAllMetrics() {
159178
WalletActionMetricName(actionType, "failed_total"),
160179
}
161180
for _, name := range actionCounters {
181+
pm.countersMutex.Lock()
162182
pm.counters[name] = &counter{value: 0}
183+
pm.countersMutex.Unlock()
163184
metricName := name // Capture for closure
164185
pm.registry.ObserveApplicationSource(
165186
"performance",
@@ -181,9 +202,11 @@ func (pm *PerformanceMetrics) registerAllMetrics() {
181202

182203
// Register duration metric for this action type
183204
durationName := WalletActionMetricName(actionType, "duration_seconds")
205+
pm.histogramsMutex.Lock()
184206
pm.histograms[durationName] = &histogram{
185207
buckets: make(map[float64]float64),
186208
}
209+
pm.histogramsMutex.Unlock()
187210
durationMetricName := durationName // Capture for closure
188211
pm.registry.ObserveApplicationSource(
189212
"performance",
@@ -212,17 +235,22 @@ func (pm *PerformanceMetrics) registerAllMetrics() {
212235
durationMetrics := []string{
213236
MetricDKGDurationSeconds,
214237
MetricSigningDurationSeconds,
238+
MetricRedemptionActionDurationSeconds,
215239
MetricWalletActionDurationSeconds,
216240
MetricCoordinationDurationSeconds,
241+
MetricCoordinationWindowDurationSeconds,
217242
MetricPingTestDurationSeconds,
243+
MetricNetworkHandshakeDurationSeconds,
218244
}
219245

220246
// First, initialize all histograms in the map
247+
pm.histogramsMutex.Lock()
221248
for _, name := range durationMetrics {
222249
pm.histograms[name] = &histogram{
223250
buckets: make(map[float64]float64),
224251
}
225252
}
253+
pm.histogramsMutex.Unlock()
226254

227255
// Then, register observers (this prevents concurrent map read/write)
228256
for _, name := range durationMetrics {
@@ -270,12 +298,17 @@ func (pm *PerformanceMetrics) registerAllMetrics() {
270298
MetricCPUUtilization,
271299
MetricMemoryUsageMB,
272300
MetricGoroutineCount,
301+
MetricCPULoadPercent,
302+
MetricRAMUtilizationPercent,
303+
MetricSwapUtilizationPercent,
273304
}
274305

275306
// First, initialize all gauges in the map
307+
pm.gaugesMutex.Lock()
276308
for _, name := range gauges {
277309
pm.gauges[name] = &gauge{value: 0}
278310
}
311+
pm.gaugesMutex.Unlock()
279312

280313
// Then, register observers (this prevents concurrent map read/write)
281314
for _, name := range gauges {
@@ -394,7 +427,7 @@ func (pm *PerformanceMetrics) SetGauge(name string, value float64) {
394427
// observeSystemMetrics periodically collects and updates system metrics
395428
// including CPU utilization, memory usage, and goroutine count.
396429
func (pm *PerformanceMetrics) observeSystemMetrics(ctx context.Context) {
397-
ticker := time.NewTicker(10 * time.Second) // Update every 10 seconds
430+
ticker := time.NewTicker(60 * time.Second) // Update every 10 seconds
398431
defer ticker.Stop()
399432

400433
var lastMemStats runtime.MemStats
@@ -428,6 +461,9 @@ func (pm *PerformanceMetrics) observeSystemMetrics(ctx context.Context) {
428461
lastMemStats = memStats
429462
lastUpdateTime = now
430463
}
464+
465+
// Update OS-level machine stats
466+
pm.updateMachineStats()
431467
case <-ctx.Done():
432468
return
433469
}
@@ -483,6 +519,37 @@ func (pm *PerformanceMetrics) calculateCPUUtilizationHeuristic(
483519
return cpuUtilization
484520
}
485521

522+
// updateMachineStats collects and updates OS-level machine statistics
523+
// including CPU load, RAM utilization, and swapfile utilization.
524+
func (pm *PerformanceMetrics) updateMachineStats() {
525+
// Get CPU load percentage (1-second average)
526+
// NOTE: cpu.Percent blocks for the specified duration (1 second) to sample
527+
// CPU usage over that interval. This blocking behavior is intentional and
528+
// necessary to obtain an accurate CPU utilization measurement. The function
529+
// will not return until the 1-second sampling period completes.
530+
cpuPercent, err := cpu.Percent(time.Second, false)
531+
if err == nil && len(cpuPercent) > 0 {
532+
pm.SetGauge(MetricCPULoadPercent, cpuPercent[0])
533+
}
534+
535+
// Get memory statistics
536+
memInfo, err := mem.VirtualMemory()
537+
if err == nil {
538+
// RAM utilization percentage
539+
pm.SetGauge(MetricRAMUtilizationPercent, memInfo.UsedPercent)
540+
541+
// Swap utilization percentage
542+
swapInfo, err := mem.SwapMemory()
543+
if err == nil && swapInfo.Total > 0 {
544+
swapUtilizationPercent := (float64(swapInfo.Used) / float64(swapInfo.Total)) * 100.0
545+
pm.SetGauge(MetricSwapUtilizationPercent, swapUtilizationPercent)
546+
} else {
547+
// If swap is not available or has no total, set to 0
548+
pm.SetGauge(MetricSwapUtilizationPercent, 0)
549+
}
550+
}
551+
}
552+
486553
// NoOpPerformanceMetrics is a no-op implementation of PerformanceMetricsRecorder
487554
// that can be used when metrics are disabled.
488555
type NoOpPerformanceMetrics struct{}
@@ -550,6 +617,17 @@ const (
550617
MetricSigningAttemptsPerOperation = "signing_attempts_per_operation"
551618
MetricSigningTimeoutsTotal = "signing_timeouts_total"
552619

620+
// Redemption Metrics
621+
MetricRedemptionExecutionsTotal = "redemption_executions_total"
622+
MetricRedemptionExecutionsSuccessTotal = "redemption_executions_success_total"
623+
MetricRedemptionExecutionsFailedTotal = "redemption_executions_failed_total"
624+
MetricRedemptionActionDurationSeconds = "redemption_action_duration_seconds"
625+
626+
// Redemption Proof Submission Metrics (SPV maintainer)
627+
MetricRedemptionProofSubmissionsTotal = "redemption_proof_submissions_total"
628+
MetricRedemptionProofSubmissionsSuccessTotal = "redemption_proof_submissions_success_total"
629+
MetricRedemptionProofSubmissionsFailedTotal = "redemption_proof_submissions_failed_total"
630+
553631
// Wallet Action Metrics (aggregate)
554632
MetricWalletActionsTotal = "wallet_actions_total"
555633
MetricWalletActionSuccessTotal = "wallet_action_success_total"
@@ -565,9 +643,18 @@ const (
565643
// Coordination Metrics
566644
MetricCoordinationWindowsDetectedTotal = "coordination_windows_detected_total"
567645
MetricCoordinationProceduresExecutedTotal = "coordination_procedures_executed_total"
568-
MetricCoordinationFailedTotal = "coordination_failed_total"
646+
MetricCoordinationFailedTotal = "coordination_failed_total" // Only when node is leader
647+
MetricCoordinationLeaderTimeoutTotal = "coordination_leader_timeout_total" // When follower observes leader timeout
569648
MetricCoordinationDurationSeconds = "coordination_duration_seconds"
570649

650+
// Coordination Window Metrics (per-window tracking)
651+
MetricCoordinationWindowDurationSeconds = "coordination_window_duration_seconds"
652+
MetricCoordinationWindowWalletsCoordinated = "coordination_window_wallets_coordinated"
653+
MetricCoordinationWindowWalletsSuccessful = "coordination_window_wallets_successful"
654+
MetricCoordinationWindowWalletsFailed = "coordination_window_wallets_failed"
655+
MetricCoordinationWindowTotalFaults = "coordination_window_total_faults"
656+
MetricCoordinationWindowCoordinationBlock = "coordination_window_coordination_block"
657+
571658
// Network Metrics
572659
MetricIncomingMessageQueueSize = "incoming_message_queue_size"
573660
MetricMessageHandlerQueueSize = "message_handler_queue_size"
@@ -580,14 +667,24 @@ const (
580667
MetricPingTestFailedTotal = "ping_test_failed_total"
581668
MetricPingTestDurationSeconds = "ping_test_duration_seconds"
582669

670+
// Network Join Request Metrics (inbound connection attempts from peers)
671+
MetricNetworkJoinRequestsTotal = "network_join_requests_total" // Total inbound join attempts
672+
MetricNetworkJoinRequestsSuccessTotal = "network_join_requests_success_total" // Successful joins
673+
MetricNetworkJoinRequestsFailedTotal = "network_join_requests_failed_total" // Failed joins (handshake failure)
674+
MetricNetworkHandshakeDurationSeconds = "network_handshake_duration_seconds" // Handshake duration
675+
MetricFirewallRejectionsTotal = "firewall_rejections_total" // Firewall rejections
676+
583677
// Wallet Dispatcher Metrics
584678
MetricWalletDispatcherActiveActions = "wallet_dispatcher_active_actions"
585679
MetricWalletDispatcherRejectedTotal = "wallet_dispatcher_rejected_total"
586680

587681
// System Metrics
588-
MetricCPUUtilization = "cpu_utilization_percent"
589-
MetricMemoryUsageMB = "memory_usage_mb"
590-
MetricGoroutineCount = "goroutine_count"
682+
MetricCPUUtilization = "cpu_utilization_percent"
683+
MetricMemoryUsageMB = "memory_usage_mb"
684+
MetricGoroutineCount = "goroutine_count"
685+
MetricCPULoadPercent = "cpu_load_percent"
686+
MetricRAMUtilizationPercent = "ram_utilization_percent"
687+
MetricSwapUtilizationPercent = "swap_utilization_percent"
591688
)
592689

593690
// WalletActionMetricName generates a metric name for a specific wallet action type.

pkg/clientinfo/performance_test.go

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,19 +64,40 @@ func TestConcurrentCounterDifferentMetrics(t *testing.T) {
6464
MetricSigningFailedTotal,
6565
}
6666

67+
// Add timeout to prevent test from hanging indefinitely
68+
testCtx, testCancel := context.WithTimeout(context.Background(), 30*time.Second)
69+
defer testCancel()
70+
71+
done := make(chan struct{})
6772
var wg sync.WaitGroup
6873
for _, metricName := range metrics {
6974
for i := 0; i < numGoroutines; i++ {
7075
wg.Add(1)
7176
go func(name string) {
7277
defer wg.Done()
7378
for j := 0; j < incrementsPer; j++ {
74-
pm.IncrementCounter(name, 1)
79+
select {
80+
case <-testCtx.Done():
81+
return
82+
default:
83+
pm.IncrementCounter(name, 1)
84+
}
7585
}
7686
}(metricName)
7787
}
7888
}
79-
wg.Wait()
89+
90+
go func() {
91+
wg.Wait()
92+
close(done)
93+
}()
94+
95+
select {
96+
case <-done:
97+
// Test completed successfully
98+
case <-testCtx.Done():
99+
t.Fatal("Test timed out waiting for goroutines to complete")
100+
}
80101

81102
expected := float64(numGoroutines * incrementsPer)
82103
for _, metricName := range metrics {
@@ -320,6 +341,9 @@ func TestMetricsInitialization(t *testing.T) {
320341
MetricCPUUtilization,
321342
MetricMemoryUsageMB,
322343
MetricGoroutineCount,
344+
MetricCPULoadPercent,
345+
MetricRAMUtilizationPercent,
346+
MetricSwapUtilizationPercent,
323347
}
324348

325349
for _, gaugeName := range gauges {

pkg/maintainer/spv/redemptions.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66

77
"github.com/keep-network/keep-core/pkg/bitcoin"
8+
"github.com/keep-network/keep-core/pkg/clientinfo"
89
"github.com/keep-network/keep-core/pkg/tbtc"
910
)
1011

@@ -46,12 +47,12 @@ func submitRedemptionProof(
4647
) error {
4748
// Record proof submission attempt
4849
if metricsRecorder != nil {
49-
metricsRecorder.IncrementCounter("redemption_proof_submissions_total", 1)
50+
metricsRecorder.IncrementCounter(clientinfo.MetricRedemptionProofSubmissionsTotal, 1)
5051
}
5152

5253
if requiredConfirmations == 0 {
5354
if metricsRecorder != nil {
54-
metricsRecorder.IncrementCounter("redemption_proof_submissions_failed_total", 1)
55+
metricsRecorder.IncrementCounter(clientinfo.MetricRedemptionProofSubmissionsFailedTotal, 1)
5556
}
5657
return fmt.Errorf(
5758
"provided required confirmations count must be greater than 0",
@@ -65,7 +66,7 @@ func submitRedemptionProof(
6566
)
6667
if err != nil {
6768
if metricsRecorder != nil {
68-
metricsRecorder.IncrementCounter("redemption_proof_submissions_failed_total", 1)
69+
metricsRecorder.IncrementCounter(clientinfo.MetricRedemptionProofSubmissionsFailedTotal, 1)
6970
}
7071
return fmt.Errorf(
7172
"failed to assemble transaction spv proof: [%v]",
@@ -79,7 +80,7 @@ func submitRedemptionProof(
7980
)
8081
if err != nil {
8182
if metricsRecorder != nil {
82-
metricsRecorder.IncrementCounter("redemption_proof_submissions_failed_total", 1)
83+
metricsRecorder.IncrementCounter(clientinfo.MetricRedemptionProofSubmissionsFailedTotal, 1)
8384
}
8485
return fmt.Errorf(
8586
"error while parsing transaction inputs: [%v]",
@@ -94,7 +95,7 @@ func submitRedemptionProof(
9495
walletPublicKeyHash,
9596
); err != nil {
9697
if metricsRecorder != nil {
97-
metricsRecorder.IncrementCounter("redemption_proof_submissions_failed_total", 1)
98+
metricsRecorder.IncrementCounter(clientinfo.MetricRedemptionProofSubmissionsFailedTotal, 1)
9899
}
99100
return fmt.Errorf(
100101
"failed to submit redemption proof with reimbursement: [%v]",
@@ -104,7 +105,7 @@ func submitRedemptionProof(
104105

105106
// Record successful proof submission
106107
if metricsRecorder != nil {
107-
metricsRecorder.IncrementCounter("redemption_proof_submissions_success_total", 1)
108+
metricsRecorder.IncrementCounter(clientinfo.MetricRedemptionProofSubmissionsSuccessTotal, 1)
108109
}
109110

110111
return nil

0 commit comments

Comments
 (0)