Commit fda1c55

fix(distributed): stop queue loops on agent nodes + dead-letter cap (#9433)
pending_backend_ops rows targeting agent-type workers looped forever: the reconciler fan-out hit a NATS subject the worker doesn't subscribe to, returned ErrNoResponders, we marked the node unhealthy, and the health monitor flipped it back to healthy on the next heartbeat. Next tick, same row, same failure.

Three related fixes:

1. enqueueAndDrainBackendOp skips nodes whose NodeType != backend. Agent workers handle agent NATS subjects, not backend.install / delete / list, so enqueueing for them guarantees an infinite retry loop. The silent skip is correct — they aren't consumers of these ops.

2. The reconciler drain mirrors enqueueAndDrainBackendOp's behavior on nats.ErrNoResponders: mark the node unhealthy before recording the failure, so subsequent ListDuePendingBackendOps calls (which filter by status=healthy) stop picking up the row until the node actually recovers. This matches the synchronous fan-out path.

3. Dead-letter cap at maxPendingBackendOpAttempts (10). After roughly an hour of exponential backoff the row is a poison message; further retries just thrash NATS. The row is deleted and logged at ERROR so the failure stays visible without retrying forever.

On top of that, NewNodeRegistry now runs a one-shot startup cleanup: drop queue rows that target agent-type nodes, non-existent nodes, or carry an empty backend name. It is guarded by the same schema-migration advisory lock so only one instance performs it. The guards above prevent new rows of this shape; the cleanup closes the gap for rows persisted by earlier versions.

Tests: the prune migration (valid row stays, agent and empty-name rows drop) on top of the existing upsert / backoff coverage.
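
As a sanity check on the "roughly an hour" figure in fix 3, here is a minimal, standalone Go sketch of the arithmetic. It assumes a doubling schedule that starts at 30s and is capped at 15m, matching the numbers quoted in the reconciler comment below; the real schedule lives in the registry's retry bookkeeping and is not part of this diff, so treat the constants as illustrative.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Assumed curve: start at 30s, double per failure, cap at 15m.
	// These constants only reproduce the ">1h over 10 attempts" claim;
	// the real schedule is implemented in the node registry, not here.
	const (
		baseDelay   = 30 * time.Second
		maxDelay    = 15 * time.Minute
		maxAttempts = 10 // mirrors maxPendingBackendOpAttempts
	)

	total := time.Duration(0)
	delay := baseDelay
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		total += delay
		delay *= 2
		if delay > maxDelay {
			delay = maxDelay
		}
	}
	fmt.Println(total) // 1h30m30s under these assumptions, comfortably past one hour
}
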
1 parent b27de08 commit fda1c55

4 files changed

Lines changed: 94 additions & 0 deletions

core/services/nodes/managers_distributed.go

Lines changed: 7 additions & 0 deletions
@@ -106,6 +106,13 @@ func (d *DistributedBackendManager) enqueueAndDrainBackendOp(ctx context.Context
 		if node.Status == StatusPending {
 			continue
 		}
+		// Backend lifecycle ops only make sense on backend-type workers.
+		// Agent workers don't subscribe to backend.install/delete/list, so
+		// enqueueing for them guarantees a forever-retrying row that the
+		// reconciler can never drain. Silently skip — they aren't consumers.
+		if node.NodeType != "" && node.NodeType != NodeTypeBackend {
+			continue
+		}
 		if err := d.registry.UpsertPendingBackendOp(ctx, node.ID, backend, op, galleriesJSON); err != nil {
 			xlog.Warn("Failed to enqueue backend op", "op", op, "node", node.Name, "backend", backend, "error", err)
 			result.Nodes = append(result.Nodes, NodeOpStatus{

core/services/nodes/reconciler.go

Lines changed: 37 additions & 0 deletions
@@ -3,12 +3,14 @@ package nodes
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"time"

 	"github.com/mudler/LocalAI/core/services/advisorylock"
 	grpcclient "github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/xlog"
+	"github.com/nats-io/nats.go"
 	"gorm.io/gorm"
 )

@@ -206,12 +208,47 @@ func (rc *ReplicaReconciler) drainPendingBackendOps(ctx context.Context) {
 			}
 			continue
 		}
+
+		// ErrNoResponders means the node has no active NATS subscription for
+		// this subject. Either its connection dropped, or it's the wrong
+		// node type entirely. Mark unhealthy so the health monitor's
+		// heartbeat-only pass doesn't immediately flip it back — and so
+		// ListDuePendingBackendOps (which filters by status=healthy) stops
+		// picking the row until the node genuinely recovers.
+		if errors.Is(applyErr, nats.ErrNoResponders) {
+			xlog.Warn("Reconciler: no NATS responders — marking node unhealthy",
+				"op", op.Op, "backend", op.Backend, "node", op.NodeID)
+			_ = rc.registry.MarkUnhealthy(ctx, op.NodeID)
+		}
+
+		// Dead-letter cap: after maxAttempts the row is the reconciler
+		// equivalent of a poison message. Delete it loudly so the queue
+		// doesn't churn NATS every tick forever — operators can re-issue
+		// the op from the UI if they still want it applied.
+		if op.Attempts+1 >= maxPendingBackendOpAttempts {
+			xlog.Error("Reconciler: abandoning pending backend op after max attempts",
+				"op", op.Op, "backend", op.Backend, "node", op.NodeID,
+				"attempts", op.Attempts+1, "last_error", applyErr)
+			if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
+				xlog.Warn("Reconciler: failed to delete abandoned op row", "id", op.ID, "error", err)
+			}
+			continue
+		}
+
 		_ = rc.registry.RecordPendingBackendOpFailure(ctx, op.ID, applyErr.Error())
 		xlog.Warn("Reconciler: pending backend op retry failed",
 			"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1, "error", applyErr)
 	}
 }

+// maxPendingBackendOpAttempts caps how many times the reconciler retries a
+// failing row before dead-lettering it. Ten attempts at exponential backoff
+// (30s → 15m cap) is >1h of wall-clock patience — well past any transient
+// worker restart or network blip. Poisoned rows beyond that are almost
+// certainly structural (wrong node type, non-existent gallery entry) and no
+// amount of further retrying will help.
+const maxPendingBackendOpAttempts = 10
+
 // probeLoadedModels gRPC-health-checks model addresses that the DB says are
 // loaded. If a model's backend process is gone (OOM, crash, manual restart)
 // we remove the row so ghosts don't linger. Only probes rows older than
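
A note on the ErrNoResponders branch above: with current NATS servers, nats.go fails a request immediately when nothing subscribes to the subject instead of waiting out the timeout, which is exactly the signal the reconciler keys off. A minimal standalone sketch of that behavior follows; it assumes a local NATS server on the default port, and the subject name is purely illustrative, not the one LocalAI uses.

package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/nats-io/nats.go"
)

func main() {
	// Assumes a nats-server reachable at the default localhost URL.
	nc, err := nats.Connect(nats.DefaultURL)
	if err != nil {
		panic(err)
	}
	defer nc.Close()

	// Nothing subscribes to this made-up subject, so the request fails
	// right away with nats.ErrNoResponders instead of burning the timeout.
	_, err = nc.Request("backend.install.nobody-listens", []byte("{}"), 2*time.Second)
	if errors.Is(err, nats.ErrNoResponders) {
		fmt.Println("no responders: mark the node unhealthy instead of retrying blindly")
	}
}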

core/services/nodes/reconciler_test.go

Lines changed: 26 additions & 0 deletions
@@ -373,4 +373,30 @@ var _ = Describe("ReplicaReconciler — state reconciliation", func() {
 			Expect(row.NextRetryAt).To(BeTemporally(">", before))
 		})
 	})
+
+	Describe("NewNodeRegistry malformed-row pruning", func() {
+		It("drops queue rows for agent nodes and non-existent nodes on startup", func() {
+			agent := &BackendNode{Name: "agent-1", NodeType: NodeTypeAgent, Address: "x"}
+			Expect(registry.Register(context.Background(), agent, true)).To(Succeed())
+			backend := &BackendNode{Name: "backend-1", NodeType: NodeTypeBackend, Address: "y"}
+			Expect(registry.Register(context.Background(), backend, true)).To(Succeed())
+
+			// Three rows: one for a valid backend node (should survive),
+			// one for an agent node (pruned), one for an empty backend name
+			// on the valid node (pruned).
+			Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "foo", OpBackendInstall, nil)).To(Succeed())
+			Expect(registry.UpsertPendingBackendOp(context.Background(), agent.ID, "foo", OpBackendInstall, nil)).To(Succeed())
+			Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "", OpBackendInstall, nil)).To(Succeed())
+
+			// Re-instantiating the registry runs the cleanup migration.
+			_, err := NewNodeRegistry(db)
+			Expect(err).ToNot(HaveOccurred())
+
+			var rows []PendingBackendOp
+			Expect(db.Find(&rows).Error).To(Succeed())
+			Expect(rows).To(HaveLen(1))
+			Expect(rows[0].NodeID).To(Equal(backend.ID))
+			Expect(rows[0].Backend).To(Equal("foo"))
+		})
+	})
 })

core/services/nodes/registry.go

Lines changed: 24 additions & 0 deletions
@@ -148,6 +148,30 @@ func NewNodeRegistry(db *gorm.DB) (*NodeRegistry, error) {
 	}); err != nil {
 		return nil, fmt.Errorf("migrating node tables: %w", err)
 	}
+
+	// One-shot cleanup of queue rows that can never drain: ops targeted at
+	// agent workers (wrong subscription set), at non-existent nodes, or with
+	// an empty backend name. The guard in enqueueAndDrainBackendOp prevents
+	// new ones from being written, but rows persisted by earlier versions
+	// keep the reconciler busy retrying a permanently-failing NATS request
+	// every 30s. Guarded by the same migration advisory lock so only one
+	// frontend runs it.
+	_ = advisorylock.WithLockCtx(context.Background(), db, advisorylock.KeySchemaMigrate, func() error {
+		res := db.Exec(`
+			DELETE FROM pending_backend_ops
+			WHERE backend = ''
+			   OR node_id NOT IN (SELECT id FROM backend_nodes WHERE node_type = ? OR node_type = '')
+		`, NodeTypeBackend)
+		if res.Error != nil {
+			xlog.Warn("Failed to prune malformed pending_backend_ops rows", "error", res.Error)
+			return res.Error
+		}
+		if res.RowsAffected > 0 {
+			xlog.Info("Pruned pending_backend_ops rows (wrong node type or empty backend)", "count", res.RowsAffected)
+		}
+		return nil
+	})
+
 	return &NodeRegistry{db: db}, nil
 }
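
advisorylock.WithLockCtx and KeySchemaMigrate are LocalAI's own helpers and are not shown in this diff. As a rough sketch of the underlying idea, letting exactly one instance run a one-shot cleanup against a shared database, here is a generic version using a Postgres transaction-scoped advisory lock via GORM; the lock key and connection string are placeholders, and the DELETE is a trimmed-down stand-in for the real statement above.

package main

import (
	"fmt"

	"gorm.io/driver/postgres"
	"gorm.io/gorm"
)

// pruneOnce runs a cleanup under a Postgres transaction-scoped advisory
// lock, so concurrent instances racing at startup don't all execute it.
// Sketch only: LocalAI's advisorylock package may work differently.
func pruneOnce(db *gorm.DB) error {
	const lockKey = int64(424242) // placeholder key, not LocalAI's KeySchemaMigrate
	return db.Transaction(func(tx *gorm.DB) error {
		var acquired bool
		if err := tx.Raw("SELECT pg_try_advisory_xact_lock(?)", lockKey).Scan(&acquired).Error; err != nil {
			return err
		}
		if !acquired {
			return nil // another instance holds the lock and is doing the cleanup
		}
		// Trimmed stand-in for the DELETE in registry.go; the lock is
		// released automatically when the transaction commits or rolls back.
		return tx.Exec(`DELETE FROM pending_backend_ops WHERE backend = ''`).Error
	})
}

func main() {
	// Placeholder DSN; point it at whatever database the registry uses.
	db, err := gorm.Open(postgres.Open("postgres://localhost/localai?sslmode=disable"), &gorm.Config{})
	if err != nil {
		panic(err)
	}
	if err := pruneOnce(db); err != nil {
		panic(err)
	}
	fmt.Println("cleanup attempted")
}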
