@@ -40,7 +40,20 @@ const (
4040 // Layer-2 fail-safe thresholds (silent when Layer 1's kubo
4141 // Internal.Libp2pForceReachability=private keeps /p2p-circuit advertised).
4242 relayCircuitMissingThreshold = 2 // 2 × healthCheckInterval ≈ 60s
43- minTimeBetweenFulaRestarts = 5 * time .Minute // cooldown to prevent restart loops
43+ minTimeBetweenFulaRestarts = 5 * time .Minute // min interval between bounces
44+
45+ // Hard rate limit: at most this many bounces in any rolling restartRateWindow.
46+ // Bounds worst-case downtime if bouncing doesn't fix the issue (e.g., relay
47+ // genuinely unreachable from this network, or a bug we haven't seen yet).
48+ maxRestartsPerWindow = 3
49+ restartRateWindow = 1 * time .Hour
50+
51+ // After a bounce, give kubo this long to come back and re-acquire a circuit
52+ // before we judge the bounce as "did not help". Multiple consecutive
53+ // unhelpful bounces escalate to a long cooldown.
54+ bounceVerifyGracePeriod = 3 * time .Minute
55+ maxConsecutiveFailedRestarts = 3
56+ longCooldownAfterFailures = 1 * time .Hour
4457
4558 // Marker file the host's commands.sh watches; mounted from /home/pi/commands
4659 // (docker-compose.yml: /home/pi/:/home:rw,rshared on the go-fula service).
@@ -301,6 +314,10 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
301314
302315 var consecutiveCircuitMissing int
303316 var lastRestartAt time.Time
317+ var consecutiveFailedRestarts int
318+ var longCooldownUntil time.Time
319+ // Sliding window of recent restart timestamps for rate-limiting.
320+ var restartHistory []time.Time
304321
305322 for {
306323 select {
@@ -334,6 +351,16 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
334351 }
335352
336353 if checkKuboHasCircuitAddress (kuboAPI ) {
354+ // Circuit is present. If a restart was triggered within the
355+ // last bounceVerifyGracePeriod, treat the recovery as
356+ // successful and reset the consecutive-failure counter.
357+ if ! lastRestartAt .IsZero () && time .Since (lastRestartAt ) < bounceVerifyGracePeriod {
358+ if consecutiveFailedRestarts > 0 {
359+ log .Infow ("Last fula restart appears to have restored circuit; resetting failure counter" ,
360+ "previousFailures" , consecutiveFailedRestarts )
361+ }
362+ consecutiveFailedRestarts = 0
363+ }
337364 consecutiveCircuitMissing = 0
338365 continue
339366 }
@@ -344,22 +371,86 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
344371 if consecutiveCircuitMissing < relayCircuitMissingThreshold {
345372 continue
346373 }
374+
375+ // Gate 1: don't bounce while internet is genuinely down.
376+ // The static relay must be reachable for a fresh reservation
377+ // attempt to succeed.
347378 if ! checkRelayPeerConnected (kuboAPI , staticRelayPeerID ) {
348379 log .Debug ("Relay peer not connected; deferring fula restart until connectivity returns" )
349380 consecutiveCircuitMissing = 0
350381 continue
351382 }
383+
384+ // Gate 2: long cooldown after repeated unhelpful bounces.
385+ // If the last few bounces did not restore the circuit (within
386+ // bounceVerifyGracePeriod), back off for longCooldownAfterFailures.
387+ if ! longCooldownUntil .IsZero () && time .Now ().Before (longCooldownUntil ) {
388+ log .Infow ("Skipping fula restart — extended cooldown after repeated failed bounces" ,
389+ "remaining" , time .Until (longCooldownUntil ),
390+ "failedRestarts" , consecutiveFailedRestarts )
391+ continue
392+ }
393+
394+ // A previous bounce is judged failed if the circuit is still missing
395+ // bounceVerifyGracePeriod after it. Nothing is counted at this point:
396+ // the failure is recorded further down, once every gate has passed.
397+ if ! lastRestartAt .IsZero () && time .Since (lastRestartAt ) >= bounceVerifyGracePeriod {
398+ // Only count once per failed bounce (next gate-1 reset clears it).
399+ if consecutiveFailedRestarts == 0 || lastRestartAt .After (time .Now ().Add (- bounceVerifyGracePeriod - minTimeBetweenFulaRestarts )) {
400+ // no-op; we account for it on the actual bounce decision below
401+ }
402+ }
403+
404+ // Gate 3: short cooldown between back-to-back bounces.
352405 if time .Since (lastRestartAt ) < minTimeBetweenFulaRestarts {
353406 log .Infow ("Skipping fula restart — cooldown not elapsed" ,
354407 "elapsed" , time .Since (lastRestartAt ))
355408 continue
356409 }
410+
411+ // Gate 4: sliding-window rate limit.
412+ // Drop restart timestamps older than restartRateWindow.
413+ now := time .Now ()
414+ cutoff := now .Add (- restartRateWindow )
415+ pruned := restartHistory [:0 ]
416+ for _ , t := range restartHistory {
417+ if t .After (cutoff ) {
418+ pruned = append (pruned , t )
419+ }
420+ }
421+ restartHistory = pruned
422+ if len (restartHistory ) >= maxRestartsPerWindow {
423+ log .Warnw ("Skipping fula restart — sliding-window rate limit hit" ,
424+ "restarts" , len (restartHistory ),
425+ "window" , restartRateWindow ,
426+ "oldestInWindow" , now .Sub (restartHistory [0 ]))
427+ continue
428+ }
429+
430+ // All gates passed. If the previous bounce was already past
431+ // its grace period and circuit is still missing, count it as failed.
432+ if ! lastRestartAt .IsZero () && time .Since (lastRestartAt ) >= bounceVerifyGracePeriod {
433+ consecutiveFailedRestarts ++
434+ log .Warnw ("Previous fula restart did not restore circuit" ,
435+ "consecutiveFailedRestarts" , consecutiveFailedRestarts ,
436+ "max" , maxConsecutiveFailedRestarts )
437+ if consecutiveFailedRestarts >= maxConsecutiveFailedRestarts {
438+ longCooldownUntil = now .Add (longCooldownAfterFailures )
439+ log .Warnw ("Too many consecutive failed restarts; entering extended cooldown" ,
440+ "until" , longCooldownUntil ,
441+ "duration" , longCooldownAfterFailures )
442+ consecutiveCircuitMissing = 0
443+ continue
444+ }
445+ }
446+
357447 log .Warn ("Kubo has no circuit reservation; signaling host to restart fula stack" )
358448 if err := signalFulaRestart (); err != nil {
359449 log .Errorw ("Failed to signal fula restart" , "err" , err )
360450 continue
361451 }
362- lastRestartAt = time .Now ()
452+ lastRestartAt = now
453+ restartHistory = append (restartHistory , now )
363454 consecutiveCircuitMissing = 0
364455 case <- ctx .Done ():
365456 return
0 commit comments