Skip to content

Commit a672fc6

Browse files
committed
added limit to number of fula restarts
1 parent e91f7e5 commit a672fc6

1 file changed

Lines changed: 93 additions & 2 deletions

File tree

blox/kubo_proxy.go

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,20 @@ const (
4040
// Layer-2 fail-safe thresholds (silent when Layer 1's kubo
4141
// Internal.Libp2pForceReachability=private keeps /p2p-circuit advertised).
4242
relayCircuitMissingThreshold = 2 // 2 × healthCheckInterval ≈ 60s
43-
minTimeBetweenFulaRestarts = 5 * time.Minute // cooldown to prevent restart loops
43+
minTimeBetweenFulaRestarts = 5 * time.Minute // min interval between bounces
44+
45+
// Hard rate limit: at most this many bounces in any rolling restartRateWindow.
46+
// Bounds worst-case downtime if bouncing doesn't fix the issue (e.g., relay
47+
// genuinely unreachable from this network, or a bug we haven't seen yet).
48+
maxRestartsPerWindow = 3
49+
restartRateWindow = 1 * time.Hour
50+
51+
// After a bounce, give kubo this long to come back and re-acquire a circuit
52+
// before we judge the bounce as "did not help". Multiple consecutive
53+
// unhelpful bounces escalate to a long cooldown.
54+
bounceVerifyGracePeriod = 3 * time.Minute
55+
maxConsecutiveFailedRestarts = 3
56+
longCooldownAfterFailures = 1 * time.Hour
4457

4558
// Marker file the host's commands.sh watches; mounted from /home/pi/commands
4659
// (docker-compose.yml: /home/pi/:/home:rw,rshared on the go-fula service).
@@ -301,6 +314,10 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
301314

302315
var consecutiveCircuitMissing int
303316
var lastRestartAt time.Time
317+
var consecutiveFailedRestarts int
318+
var longCooldownUntil time.Time
319+
// Sliding window of recent restart timestamps for rate-limiting.
320+
var restartHistory []time.Time
304321

305322
for {
306323
select {
@@ -334,6 +351,16 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
334351
}
335352

336353
if checkKuboHasCircuitAddress(kuboAPI) {
354+
// Circuit is present. If a restart was triggered within the
355+
// last bounceVerifyGracePeriod, treat the recovery as
356+
// successful and reset the consecutive-failure counter.
357+
if !lastRestartAt.IsZero() && time.Since(lastRestartAt) < bounceVerifyGracePeriod {
358+
if consecutiveFailedRestarts > 0 {
359+
log.Infow("Last fula restart appears to have restored circuit; resetting failure counter",
360+
"previousFailures", consecutiveFailedRestarts)
361+
}
362+
consecutiveFailedRestarts = 0
363+
}
337364
consecutiveCircuitMissing = 0
338365
continue
339366
}
@@ -344,22 +371,86 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
344371
if consecutiveCircuitMissing < relayCircuitMissingThreshold {
345372
continue
346373
}
374+
375+
// Gate 1: don't bounce while internet is genuinely down.
376+
// The static relay must be reachable for a fresh reservation
377+
// attempt to succeed.
347378
if !checkRelayPeerConnected(kuboAPI, staticRelayPeerID) {
348379
log.Debug("Relay peer not connected; deferring fula restart until connectivity returns")
349380
consecutiveCircuitMissing = 0
350381
continue
351382
}
383+
384+
// Gate 2: long cooldown after repeated unhelpful bounces.
385+
// If the last few bounces did not restore the circuit (within
386+
// bounceVerifyGracePeriod), back off for longCooldownAfterFailures.
387+
if !longCooldownUntil.IsZero() && time.Now().Before(longCooldownUntil) {
388+
log.Infow("Skipping fula restart — extended cooldown after repeated failed bounces",
389+
"remaining", time.Until(longCooldownUntil),
390+
"failedRestarts", consecutiveFailedRestarts)
391+
continue
392+
}
393+
394+
// If a previous bounce already verified as failed, count it.
395+
// We judge a bounce as failed if circuit is still missing
396+
// bounceVerifyGracePeriod after the bounce.
397+
if !lastRestartAt.IsZero() && time.Since(lastRestartAt) >= bounceVerifyGracePeriod {
398+
// Only count once per failed bounce (next gate-1 reset clears it).
399+
if consecutiveFailedRestarts == 0 || lastRestartAt.After(time.Now().Add(-bounceVerifyGracePeriod-minTimeBetweenFulaRestarts)) {
400+
// no-op; we account for it on the actual bounce decision below
401+
}
402+
}
403+
404+
// Gate 3: short cooldown between back-to-back bounces.
352405
if time.Since(lastRestartAt) < minTimeBetweenFulaRestarts {
353406
log.Infow("Skipping fula restart — cooldown not elapsed",
354407
"elapsed", time.Since(lastRestartAt))
355408
continue
356409
}
410+
411+
// Gate 4: sliding-window rate limit.
412+
// Drop restart timestamps older than restartRateWindow.
413+
now := time.Now()
414+
cutoff := now.Add(-restartRateWindow)
415+
pruned := restartHistory[:0]
416+
for _, t := range restartHistory {
417+
if t.After(cutoff) {
418+
pruned = append(pruned, t)
419+
}
420+
}
421+
restartHistory = pruned
422+
if len(restartHistory) >= maxRestartsPerWindow {
423+
log.Warnw("Skipping fula restart — sliding-window rate limit hit",
424+
"restarts", len(restartHistory),
425+
"window", restartRateWindow,
426+
"oldestInWindow", now.Sub(restartHistory[0]))
427+
continue
428+
}
429+
430+
// All gates passed. If the previous bounce was already past
431+
// its grace period and circuit is still missing, count it as failed.
432+
if !lastRestartAt.IsZero() && time.Since(lastRestartAt) >= bounceVerifyGracePeriod {
433+
consecutiveFailedRestarts++
434+
log.Warnw("Previous fula restart did not restore circuit",
435+
"consecutiveFailedRestarts", consecutiveFailedRestarts,
436+
"max", maxConsecutiveFailedRestarts)
437+
if consecutiveFailedRestarts >= maxConsecutiveFailedRestarts {
438+
longCooldownUntil = now.Add(longCooldownAfterFailures)
439+
log.Warnw("Too many consecutive failed restarts; entering extended cooldown",
440+
"until", longCooldownUntil,
441+
"duration", longCooldownAfterFailures)
442+
consecutiveCircuitMissing = 0
443+
continue
444+
}
445+
}
446+
357447
log.Warn("Kubo has no circuit reservation; signaling host to restart fula stack")
358448
if err := signalFulaRestart(); err != nil {
359449
log.Errorw("Failed to signal fula restart", "err", err)
360450
continue
361451
}
362-
lastRestartAt = time.Now()
452+
lastRestartAt = now
453+
restartHistory = append(restartHistory, now)
363454
consecutiveCircuitMissing = 0
364455
case <-ctx.Done():
365456
return

0 commit comments

Comments
 (0)