Skip to content

Commit 70194a0

Browse files
committed
resolve the bug that kubo could not connect after a short internet outage
1 parent e54af91 commit 70194a0

2 files changed

Lines changed: 149 additions & 0 deletions

File tree

blox/blox.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,10 @@ func (p *Blox) Start(ctx context.Context) error {
407407
return err
408408
}
409409

410+
// Clear any leftover restart-marker file from a previous process so it
411+
// doesn't trigger an unwanted fula restart on first commands.sh tick.
412+
cleanupStaleRestartMarker()
413+
410414
// Wait for kubo and register p2p protocols
411415
kuboAPI := getKuboAPIAddr(p.kuboAPIAddr)
412416
if err := waitForKuboAndRegister(ctx, kuboAPI); err != nil {

blox/kubo_proxy.go

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,23 @@ const (
3131

3232
poolsAPIEndpoint = "https://pools.fx.land/pools/"
3333
serverKuboPeerIDCachePath = "/internal/.tmp/pool_%s_server_kubo.tmp"
34+
35+
// Static relay PeerID — matches Peering.Peers / Swarm.RelayClient.StaticRelays
36+
// in the kubo template config. The watchdog gates fula restarts on actual
37+
// relay reachability so we don't restart-loop while internet is genuinely down.
38+
staticRelayPeerID = "12D3KooWDRrBaAfPwsGJivBoUw5fE7ZpDiyfUjqgiURq2DEcL835"
39+
40+
// Layer-2 fail-safe thresholds (silent when Layer 1's kubo
41+
// Internal.Libp2pForceReachability=private keeps /p2p-circuit advertised).
42+
relayCircuitMissingThreshold = 2 // 2 × healthCheckInterval ≈ 60s
43+
minTimeBetweenFulaRestarts = 5 * time.Minute // cooldown to prevent restart loops
44+
45+
// Marker file the host's commands.sh watches; mounted from /home/pi/commands
46+
// (docker-compose.yml: /home/pi/:/home:rw,rshared on the go-fula service).
47+
// The host handler runs `docker compose restart` (not just kubo) because
48+
// ipfs-cluster's init-time peering registrations on kubo are lost on a
49+
// kubo-only restart and only re-applied when cluster's init re-runs.
50+
fulaRestartCommandPath = "/home/commands/.command_restart_fula"
3451
)
3552

3653
type p2pProtocol struct {
@@ -264,6 +281,15 @@ func waitForKuboAndRegister(ctx context.Context, kuboAPI string) error {
264281

265282
// watchKuboP2P periodically checks that kubo p2p listeners are active.
266283
// If kubo restarts, re-registers all protocols and the cluster forward.
284+
//
285+
// Also runs a Layer-2 fail-safe: if kubo's Identify Addresses lacks a
286+
// /p2p-circuit address for relayCircuitMissingThreshold consecutive cycles
287+
// while the static relay is still reachable, signal the host to restart
288+
// the fula stack (compose restart, not just kubo, so ipfs-cluster's init
289+
// re-runs and re-registers its peering/forward setup against the fresh
290+
// kubo). When Layer 1 (Internal.Libp2pForceReachability=private in the
291+
// kubo config) is in effect this path is silent because /p2p-circuit
292+
// stays continuously advertised.
267293
func (p *Blox) watchKuboP2P(ctx context.Context) {
268294
kuboAPI := p.kuboAPIAddr
269295
if kuboAPI == "" {
@@ -273,11 +299,15 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
273299
ticker := time.NewTicker(healthCheckInterval)
274300
defer ticker.Stop()
275301

302+
var consecutiveCircuitMissing int
303+
var lastRestartAt time.Time
304+
276305
for {
277306
select {
278307
case <-ticker.C:
279308
if !checkKuboAlive(kuboAPI) {
280309
log.Warn("Kubo not responding, will re-register when available")
310+
consecutiveCircuitMissing = 0
281311
continue
282312
}
283313
if !checkKuboP2PListeners(kuboAPI) {
@@ -302,6 +332,35 @@ func (p *Blox) watchKuboP2P(ctx context.Context) {
302332
}
303333
}
304334
}
335+
336+
if checkKuboHasCircuitAddress(kuboAPI) {
337+
consecutiveCircuitMissing = 0
338+
continue
339+
}
340+
consecutiveCircuitMissing++
341+
log.Warnw("Kubo Identify is missing /p2p-circuit address",
342+
"consecutive", consecutiveCircuitMissing,
343+
"threshold", relayCircuitMissingThreshold)
344+
if consecutiveCircuitMissing < relayCircuitMissingThreshold {
345+
continue
346+
}
347+
if !checkRelayPeerConnected(kuboAPI, staticRelayPeerID) {
348+
log.Debug("Relay peer not connected; deferring fula restart until connectivity returns")
349+
consecutiveCircuitMissing = 0
350+
continue
351+
}
352+
if time.Since(lastRestartAt) < minTimeBetweenFulaRestarts {
353+
log.Infow("Skipping fula restart — cooldown not elapsed",
354+
"elapsed", time.Since(lastRestartAt))
355+
continue
356+
}
357+
log.Warn("Kubo has no circuit reservation; signaling host to restart fula stack")
358+
if err := signalFulaRestart(); err != nil {
359+
log.Errorw("Failed to signal fula restart", "err", err)
360+
continue
361+
}
362+
lastRestartAt = time.Now()
363+
consecutiveCircuitMissing = 0
305364
case <-ctx.Done():
306365
return
307366
}
@@ -428,3 +487,89 @@ func checkClusterForward(kuboAPI string) bool {
428487
}
429488
return false
430489
}
490+
491+
// checkKuboHasCircuitAddress returns true if kubo's /api/v0/id Addresses list
// contains a /p2p-circuit address. That's the only address reachable from peers
// behind NAT (which is everyone, including the mobile app), so its absence
// means the device is unreachable over the internet — the bug this watchdog
// catches if Layer 1 (Internal.Libp2pForceReachability=private) doesn't apply.
//
// Returns false on any transport, status, or decode error so a flaky API
// reads as "circuit missing" and the caller's counter/threshold logic absorbs
// transient failures.
func checkKuboHasCircuitAddress(kuboAPI string) bool {
	url := fmt.Sprintf("http://%s/api/v0/id", kuboAPI)
	// Bound the request: the default http.Client has no timeout, so a wedged
	// kubo API would otherwise stall the watchdog loop indefinitely.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Post(url, "", nil)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return false
	}
	var result struct {
		Addresses []string `json:"Addresses"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return false
	}
	for _, a := range result.Addresses {
		// Match "/p2p-circuit" with or without a trailing component: kubo's
		// Identify output usually appends "/p2p/<selfID>" after the circuit
		// component, but a bare relay address ends in "/p2p-circuit" with
		// nothing after it — the old "/p2p-circuit/" check missed that form.
		if strings.Contains(a, "/p2p-circuit") {
			return true
		}
	}
	return false
}
519+
520+
// checkRelayPeerConnected returns true if the static relay's PeerID appears in
// kubo's swarm peers list. Gates kubo bounces on actual relay reachability: if
// internet is down, bouncing won't acquire a reservation either, so we wait
// until connectivity returns instead of restart-looping.
//
// Returns false on any transport, status, or decode error — the caller treats
// that the same as "relay unreachable" and defers the restart.
func checkRelayPeerConnected(kuboAPI, relayPeerID string) bool {
	url := fmt.Sprintf("http://%s/api/v0/swarm/peers", kuboAPI)
	// Bound the request: the default http.Client has no timeout, so a wedged
	// kubo API would otherwise block the watchdog loop forever.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Post(url, "", nil)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return false
	}
	var result struct {
		Peers []struct {
			Peer string `json:"Peer"`
		} `json:"Peers"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return false
	}
	for _, p := range result.Peers {
		if p.Peer == relayPeerID {
			return true
		}
	}
	return false
}
549+
550+
// signalFulaRestart drops a marker file the host's commands.sh watches via
551+
// inotify. The host handler runs `docker compose restart` for the whole fula
552+
// stack, not just kubo, because ipfs-cluster's init-time setup on kubo
553+
// (peering and p2p forward registrations) is lost on a kubo-only restart.
554+
// Removes any stale marker first so the create event fires reliably even if
555+
// commands.sh hasn't yet processed a previous file.
556+
func signalFulaRestart() error {
557+
if err := os.MkdirAll(filepath.Dir(fulaRestartCommandPath), 0755); err != nil {
558+
return fmt.Errorf("mkdir commands dir: %w", err)
559+
}
560+
_ = os.Remove(fulaRestartCommandPath)
561+
f, err := os.Create(fulaRestartCommandPath)
562+
if err != nil {
563+
return fmt.Errorf("create command file: %w", err)
564+
}
565+
return f.Close()
566+
}
567+
568+
// cleanupStaleRestartMarker removes any leftover marker file from a previous
569+
// process. Runs once at startup so a stale file never triggers an unwanted
570+
// restart on first commands.sh wake-up.
571+
func cleanupStaleRestartMarker() {
572+
if err := os.Remove(fulaRestartCommandPath); err != nil && !os.IsNotExist(err) {
573+
log.Warnw("Failed to clean stale fula-restart marker", "err", err)
574+
}
575+
}

0 commit comments

Comments
 (0)