@@ -31,6 +31,23 @@ const (
 
 	poolsAPIEndpoint          = "https://pools.fx.land/pools/"
 	serverKuboPeerIDCachePath = "/internal/.tmp/pool_%s_server_kubo.tmp"
+
+	// Static relay PeerID — matches Peering.Peers / Swarm.RelayClient.StaticRelays
+	// in the kubo template config. The watchdog gates fula restarts on actual
+	// relay reachability so we don't restart-loop while internet is genuinely down.
+	staticRelayPeerID = "12D3KooWDRrBaAfPwsGJivBoUw5fE7ZpDiyfUjqgiURq2DEcL835"
+
+	// Layer-2 fail-safe thresholds (silent when Layer 1's kubo
+	// Internal.Libp2pForceReachability=private keeps /p2p-circuit advertised).
+	relayCircuitMissingThreshold = 2               // 2 × healthCheckInterval ≈ 60s
+	minTimeBetweenFulaRestarts   = 5 * time.Minute // cooldown to prevent restart loops
+
+	// Marker file the host's commands.sh watches; mounted from /home/pi/commands
+	// (docker-compose.yml: /home/pi/:/home:rw,rshared on the go-fula service).
+	// The host handler runs `docker compose restart` (not just kubo) because
+	// ipfs-cluster's init-time peering registrations on kubo are lost on a
+	// kubo-only restart and only re-applied when cluster's init re-runs.
+	fulaRestartCommandPath = "/home/commands/.command_restart_fula"
 )
 
 type p2pProtocol struct {
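
For reference, the constants above line up with the kubo config this diff's comments cite. A minimal sketch of the relevant fragment, assuming an illustrative relay multiaddr (only the PeerID is taken from this diff; `Internal.Libp2pForceReachability`, `Swarm.RelayClient`, and `Peering.Peers` are standard kubo config keys, but the exact template shipped with fula is not shown here):

```json
{
  "Internal": {
    "Libp2pForceReachability": "private"
  },
  "Swarm": {
    "RelayClient": {
      "Enabled": true,
      "StaticRelays": [
        "/dns4/relay.example.com/tcp/4001/p2p/12D3KooWDRrBaAfPwsGJivBoUw5fE7ZpDiyfUjqgiURq2DEcL835"
      ]
    }
  },
  "Peering": {
    "Peers": [
      { "ID": "12D3KooWDRrBaAfPwsGJivBoUw5fE7ZpDiyfUjqgiURq2DEcL835", "Addrs": [] }
    ]
  }
}
```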
@@ -264,6 +281,15 @@ func waitForKuboAndRegister(ctx context.Context, kuboAPI string) error {
 
 // watchKuboP2P periodically checks that kubo p2p listeners are active.
 // If kubo restarts, re-registers all protocols and the cluster forward.
+//
+// Also runs a Layer-2 fail-safe: if kubo's Identify Addresses lacks a
+// /p2p-circuit address for relayCircuitMissingThreshold consecutive cycles
+// while the static relay is still reachable, signal the host to restart
+// the fula stack (compose restart, not just kubo, so ipfs-cluster's init
+// re-runs and re-registers its peering/forward setup against the fresh
+// kubo). When Layer 1 (Internal.Libp2pForceReachability=private in the
+// kubo config) is in effect this path is silent because /p2p-circuit
+// stays continuously advertised.
 func (p *Blox) watchKuboP2P(ctx context.Context) {
 	kuboAPI := p.kuboAPIAddr
 	if kuboAPI == "" {
@@ -273,11 +299,15 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
 	ticker := time.NewTicker(healthCheckInterval)
 	defer ticker.Stop()
 
+	var consecutiveCircuitMissing int
+	var lastRestartAt time.Time
+
 	for {
 		select {
 		case <-ticker.C:
 			if !checkKuboAlive(kuboAPI) {
 				log.Warn("Kubo not responding, will re-register when available")
+				consecutiveCircuitMissing = 0
 				continue
 			}
 			if !checkKuboP2PListeners(kuboAPI) {
@@ -302,6 +332,35 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
 					}
 				}
 			}
+
+			if checkKuboHasCircuitAddress(kuboAPI) {
+				consecutiveCircuitMissing = 0
+				continue
+			}
+			consecutiveCircuitMissing++
+			log.Warnw("Kubo Identify is missing /p2p-circuit address",
+				"consecutive", consecutiveCircuitMissing,
+				"threshold", relayCircuitMissingThreshold)
+			if consecutiveCircuitMissing < relayCircuitMissingThreshold {
+				continue
+			}
+			if !checkRelayPeerConnected(kuboAPI, staticRelayPeerID) {
+				log.Debug("Relay peer not connected; deferring fula restart until connectivity returns")
+				consecutiveCircuitMissing = 0
+				continue
+			}
+			if time.Since(lastRestartAt) < minTimeBetweenFulaRestarts {
+				log.Infow("Skipping fula restart — cooldown not elapsed",
+					"elapsed", time.Since(lastRestartAt))
+				continue
+			}
+			log.Warn("Kubo has no circuit reservation; signaling host to restart fula stack")
+			if err := signalFulaRestart(); err != nil {
+				log.Errorw("Failed to signal fula restart", "err", err)
+				continue
+			}
+			lastRestartAt = time.Now()
+			consecutiveCircuitMissing = 0
 		case <-ctx.Done():
 			return
 		}
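
The loop above amounts to a guarded restart rule: signal a restart only after the circuit address has been missing for `relayCircuitMissingThreshold` consecutive checks, only while the static relay itself is reachable, and only outside the cooldown window. A minimal sketch of that rule factored as a pure function, useful for unit tests; the `restartDecision` name and signature are hypothetical, not part of this diff:

```go
// Hypothetical refactor sketch, not part of the PR: the watchdog's gating
// rule as a pure function so the threshold/cooldown logic can be unit-tested.
package main

import (
	"fmt"
	"time"
)

const (
	relayCircuitMissingThreshold = 2 // same values as the diff
	minTimeBetweenFulaRestarts   = 5 * time.Minute
)

// restartDecision reports whether to signal a fula restart and returns the
// updated consecutive-miss counter.
func restartDecision(hasCircuit, relayConnected bool, consecutive int, now, lastRestart time.Time) (restart bool, next int) {
	if hasCircuit {
		return false, 0 // healthy: reset the streak
	}
	consecutive++
	if consecutive < relayCircuitMissingThreshold {
		return false, consecutive // not enough consecutive misses yet
	}
	if !relayConnected {
		return false, 0 // internet likely down; restarting would not help
	}
	if now.Sub(lastRestart) < minTimeBetweenFulaRestarts {
		return false, consecutive // cooldown still active
	}
	return true, 0
}

func main() {
	// Second consecutive miss, relay reachable, no recent restart: fires.
	restart, next := restartDecision(false, true, 1, time.Now(), time.Time{})
	fmt.Println(restart, next) // true 0
}
```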
@@ -428,3 +487,89 @@ func checkClusterForward(kuboAPI string) bool {
 	}
 	return false
 }
+
+// checkKuboHasCircuitAddress returns true if kubo's /api/v0/id Addresses list
+// contains a /p2p-circuit address. That's the only address reachable from peers
+// behind NAT (which is everyone, including the mobile app), so its absence
+// means the device is unreachable over the internet — the bug this watchdog
+// catches if Layer 1 (Internal.Libp2pForceReachability=private) doesn't apply.
+func checkKuboHasCircuitAddress(kuboAPI string) bool {
+	url := fmt.Sprintf("http://%s/api/v0/id", kuboAPI)
+	resp, err := http.Post(url, "", nil)
+	if err != nil {
+		return false
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return false
+	}
+	var result struct {
+		Addresses []string `json:"Addresses"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return false
+	}
+	for _, a := range result.Addresses {
+		if strings.Contains(a, "/p2p-circuit/") {
+			return true
+		}
+	}
+	return false
+}
+
+// checkRelayPeerConnected returns true if the static relay's PeerID appears in
+// kubo's swarm peers list. Gates kubo bounces on actual relay reachability: if
+// internet is down, bouncing won't acquire a reservation either, so we wait
+// until connectivity returns instead of restart-looping.
+func checkRelayPeerConnected(kuboAPI, relayPeerID string) bool {
+	url := fmt.Sprintf("http://%s/api/v0/swarm/peers", kuboAPI)
+	resp, err := http.Post(url, "", nil)
+	if err != nil {
+		return false
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return false
+	}
+	var result struct {
+		Peers []struct {
+			Peer string `json:"Peer"`
+		} `json:"Peers"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return false
+	}
+	for _, p := range result.Peers {
+		if p.Peer == relayPeerID {
+			return true
+		}
+	}
+	return false
+}
+
+// signalFulaRestart drops a marker file the host's commands.sh watches via
+// inotify. The host handler runs `docker compose restart` for the whole fula
+// stack, not just kubo, because ipfs-cluster's init-time setup on kubo
+// (peering and p2p forward registrations) is lost on a kubo-only restart.
+// Removes any stale marker first so the create event fires reliably even if
+// commands.sh hasn't yet processed a previous file.
+func signalFulaRestart() error {
+	if err := os.MkdirAll(filepath.Dir(fulaRestartCommandPath), 0755); err != nil {
+		return fmt.Errorf("mkdir commands dir: %w", err)
+	}
+	_ = os.Remove(fulaRestartCommandPath)
+	f, err := os.Create(fulaRestartCommandPath)
+	if err != nil {
+		return fmt.Errorf("create command file: %w", err)
+	}
+	return f.Close()
+}
+
+// cleanupStaleRestartMarker removes any leftover marker file from a previous
+// process. Runs once at startup so a stale file never triggers an unwanted
+// restart on first commands.sh wake-up.
+func cleanupStaleRestartMarker() {
+	if err := os.Remove(fulaRestartCommandPath); err != nil && !os.IsNotExist(err) {
+		log.Warnw("Failed to clean stale fula-restart marker", "err", err)
+	}
+}
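
Both helpers are thin wrappers over kubo's HTTP RPC, so the same checks can be run by hand against a node. A self-contained diagnostic sketch, assuming the default API address `127.0.0.1:5001` (an assumption; use your deployment's `kuboAPIAddr`). Note kubo's RPC accepts POST only, which is why the helpers above use `http.Post`:

```go
// Standalone diagnostic sketch, not part of the PR: runs the same two probes
// as checkKuboHasCircuitAddress and checkRelayPeerConnected and prints the
// result. The API address below is an assumption (kubo's default).
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

const (
	kuboAPI = "127.0.0.1:5001" // assumption: default kubo RPC address
	relayID = "12D3KooWDRrBaAfPwsGJivBoUw5fE7ZpDiyfUjqgiURq2DEcL835"
)

func post(path string, out any) error {
	resp, err := http.Post("http://"+kuboAPI+path, "", nil) // kubo RPC is POST-only
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status: %s", resp.Status)
	}
	return json.NewDecoder(resp.Body).Decode(out)
}

func main() {
	var id struct {
		Addresses []string `json:"Addresses"`
	}
	var peers struct {
		Peers []struct {
			Peer string `json:"Peer"`
		} `json:"Peers"`
	}
	if err := post("/api/v0/id", &id); err != nil {
		fmt.Println("id query failed:", err)
		return
	}
	if err := post("/api/v0/swarm/peers", &peers); err != nil {
		fmt.Println("swarm query failed:", err)
		return
	}
	hasCircuit, relayUp := false, false
	for _, a := range id.Addresses {
		hasCircuit = hasCircuit || strings.Contains(a, "/p2p-circuit")
	}
	for _, p := range peers.Peers {
		relayUp = relayUp || p.Peer == relayID
	}
	fmt.Printf("circuit advertised: %v, relay connected: %v\n", hasCircuit, relayUp)
}
```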
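
The host side of the marker-file protocol lives in commands.sh, which this diff only describes behaviorally (inotify watch, then `docker compose restart`). A hypothetical sketch of such a handler, in shell since the host component is a shell script; the watch path, the inotify-tools dependency, and the compose file location are all assumptions, not taken from this PR:

```sh
#!/bin/sh
# Hypothetical host-side watcher; the real commands.sh is not in this diff.
WATCH_DIR=/home/pi/commands   # assumption: host path behind the bind mount
MARKER=.command_restart_fula  # matches fulaRestartCommandPath's basename
while inotifywait -q -e create,moved_to "$WATCH_DIR"; do
  if [ -e "$WATCH_DIR/$MARKER" ]; then
    rm -f "$WATCH_DIR/$MARKER"
    # Restart the whole stack, not just kubo, so ipfs-cluster's init re-runs.
    docker compose -f /home/pi/fula/docker-compose.yml restart  # assumption: compose path
  fi
done
```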