mudler
diff --git a/‎core/services/nodes/probe_cache.go‎
Lines changed: 94 additions & 0 deletions b/‎core/services/nodes/probe_cache.go‎
Lines changed: 94 additions & 0 deletions
diff --git a/‎core/services/nodes/probe_cache_test.go‎
Lines changed: 145 additions & 0 deletions b/‎core/services/nodes/probe_cache_test.go‎
Lines changed: 145 additions & 0 deletions
diff --git a/‎core/services/nodes/registry.go‎
Lines changed: 24 additions & 13 deletions b/‎core/services/nodes/registry.go‎
Lines changed: 24 additions & 13 deletions
diff --git a/‎core/services/nodes/registry_test.go‎
Lines changed: 74 additions & 0 deletions b/‎core/services/nodes/registry_test.go‎
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,94 @@
+package nodes
+
+import (
+	"sync"
+	"time"
+
+	"golang.org/x/sync/singleflight"
+)
+
+// probeCache memoizes recent successful gRPC HealthCheck results for
+// (nodeID, addr) tuples so SmartRouter.probeHealth doesn't pay a round-trip
+// on every inference request.
+//
+// Why this exists: with per-request routing (see pkg/model/loader.go), every
+// inference call goes through SmartRouter.Route, which probes the backend
+// before returning a client. Many gRPC backends (notably llama.cpp's server)
+// serialize HealthCheck against active Predict on a shared goroutine, so a
+// burst of new requests can stall behind a single long-running stream —
+// exactly the "queue stalling" symptom observed in distributed clusters.
+//
+// The background HealthMonitor (perModelHealthCheck) is still the cluster-wide
+// source of truth that reaps actually-dead backends within ~45s; this cache
+// only saves the per-request hot path from re-asking when nothing has changed.
+//
+// TTL matches healthCheckTTL in pkg/model/model.go so the single-process
+// IsRecentlyHealthy path and this distributed-mode path share the same
+// staleness budget.
+//
+// DoOrCached records only successful probes. Entries are removed solely by
+// Invalidate: an expired entry stays in the map (and is reported as not fresh
+// by IsFresh) until it is overwritten or invalidated.
+//
+// A probeCache embeds a sync.Mutex, so it must not be copied after first use;
+// build it with newProbeCache and share the returned pointer.
+type probeCache struct {
+	ttl    time.Duration        // freshness window; <= 0 means nothing is ever fresh
+	mu     sync.Mutex           // guards seen
+	seen   map[string]time.Time // key → last successful probe
+	flight singleflight.Group   // coalesces concurrent probes for the same key
+}
+
+// newProbeCache builds an empty probeCache whose entries stay fresh for ttl.
+//
+// A ttl of zero (or less) turns caching off: IsFresh always reports false, so
+// every DoOrCached call runs its probe.
+func newProbeCache(ttl time.Duration) *probeCache {
+	cache := new(probeCache)
+	cache.ttl = ttl
+	cache.seen = map[string]time.Time{}
+	return cache
+}
+
+// IsFresh reports whether key has a recorded successful probe that is younger
+// than the cache TTL. It always returns false when the TTL is zero or
+// negative, i.e. when caching is disabled.
+func (c *probeCache) IsFresh(key string) bool {
+	if c.ttl <= 0 {
+		return false
+	}
+
+	// Hold the lock only for the map read; the age comparison needs no lock.
+	c.mu.Lock()
+	probedAt, found := c.seen[key]
+	c.mu.Unlock()
+
+	if !found {
+		return false
+	}
+	return time.Since(probedAt) < c.ttl
+}
+
+// markFresh records key as successfully probed at the current time.
+//
+// When caching is disabled (ttl <= 0) this is a no-op: IsFresh never consults
+// the map in that mode, so storing a timestamp would only grow the map and
+// take the lock on every successful probe without any reader benefiting.
+func (c *probeCache) markFresh(key string) {
+	if c.ttl <= 0 {
+		return
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.seen[key] = time.Now()
+}
+
+// Invalidate forgets any recorded successful probe for key, so the next
+// DoOrCached call for that key has to run its probe again. Call it after a
+// probe failure, or on any other hint that the backend may no longer be
+// alive. Invalidating a key that is not cached is a no-op.
+func (c *probeCache) Invalidate(key string) {
+	c.mu.Lock()
+	delete(c.seen, key)
+	c.mu.Unlock()
+}
+
+// DoOrCached reports whether the backend identified by key is healthy.
+//
+// A fresh cache entry answers immediately with true. Otherwise probe is run,
+// with concurrent callers for the same key coalesced through singleflight so
+// that they share one probe and its result. A successful probe is cached; a
+// failed probe is never cached — the entry is dropped and the next call
+// probes again, so a transient failure does not stick.
+func (c *probeCache) DoOrCached(key string, probe func() bool) bool {
+	if c.IsFresh(key) {
+		return true
+	}
+
+	runProbe := func() (any, error) {
+		// Re-check inside the flight: a probe that completed between the
+		// check above and this callback starting may already have refreshed
+		// the entry.
+		if c.IsFresh(key) {
+			return true, nil
+		}
+		if healthy := probe(); !healthy {
+			c.Invalidate(key)
+			return false, nil
+		}
+		c.markFresh(key)
+		return true, nil
+	}
+
+	// runProbe always returns a nil error and a bool value, so only the value
+	// is inspected.
+	outcome, _, _ := c.flight.Do(key, runProbe)
+	return outcome.(bool)
+}
@@ -0,0 +1,145 @@
+package nodes
+
+import (
+	"sync"
+	"sync/atomic"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("probeCache", func() {
+	It("invokes the probe on a cold cache and caches success", func() {
+		// A one-minute TTL keeps the entry fresh for the whole spec.
+		cache := newProbeCache(time.Minute)
+		var probes atomic.Int32
+		alwaysHealthy := func() bool {
+			probes.Add(1)
+			return true
+		}
+
+		for attempt := 0; attempt < 3; attempt++ {
+			Expect(cache.DoOrCached("k", alwaysHealthy)).To(BeTrue())
+		}
+
+		// Only the first call reached the probe; the other two were answered
+		// from the cache.
+		Expect(probes.Load()).To(Equal(int32(1)))
+	})
+
+	It("re-probes after the TTL expires", func() {
+		// The TTL is 1 ms and the spec sleeps 5 ms between the two calls, so
+		// the entry written by the first call has expired by the second.
+		cache := newProbeCache(time.Millisecond)
+		var probes atomic.Int32
+		alwaysHealthy := func() bool {
+			probes.Add(1)
+			return true
+		}
+
+		first := cache.DoOrCached("k", alwaysHealthy)
+		time.Sleep(5 * time.Millisecond)
+		second := cache.DoOrCached("k", alwaysHealthy)
+
+		Expect(first).To(BeTrue())
+		Expect(second).To(BeTrue())
+		Expect(probes.Load()).To(Equal(int32(2)))
+	})
+
+	It("does not cache failed probes — next call re-probes", func() {
+		cache := newProbeCache(time.Minute)
+		var probes atomic.Int32
+		var healthy atomic.Bool // zero value is false: the backend starts out failing
+		scriptedProbe := func() bool {
+			probes.Add(1)
+			return healthy.Load()
+		}
+
+		// While the backend is down, the failure must not be remembered.
+		Expect(cache.DoOrCached("k", scriptedProbe)).To(BeFalse())
+		Expect(cache.IsFresh("k")).To(BeFalse())
+
+		// Once it recovers, the next call probes again and caches the success.
+		healthy.Store(true)
+		Expect(cache.DoOrCached("k", scriptedProbe)).To(BeTrue())
+		Expect(cache.IsFresh("k")).To(BeTrue())
+
+		// A further call is answered from the cache: two probes in total.
+		Expect(cache.DoOrCached("k", scriptedProbe)).To(BeTrue())
+		Expect(probes.Load()).To(Equal(int32(2)))
+	})
+
+	It("coalesces concurrent probes via singleflight", func() {
+		// Models several chat completions arriving simultaneously for a
+		// not-yet-cached backend. Without singleflight every caller would
+		// dial the backend, defeating the purpose of the cache.
+		c := newProbeCache(time.Minute)
+		var calls int32
+		// start is closed once to release all goroutines together.
+		start := make(chan struct{})
+		probe := func() bool {
+			atomic.AddInt32(&calls, 1)
+			// Stall briefly so the other goroutines have time to pile up
+			// inside flight.Do while this probe is still running. A goroutine
+			// that only gets scheduled after the probe has finished is
+			// answered from the now-fresh cache instead, so the probe count
+			// stays at one either way.
+			time.Sleep(50 * time.Millisecond)
+			return true
+		}
+
+		const N = 8
+		var wg sync.WaitGroup
+		// Each goroutine writes only its own slot, so results needs no lock.
+		results := make([]bool, N)
+		for i := 0; i < N; i++ {
+			wg.Add(1)
+			go func(i int) {
+				defer wg.Done()
+				<-start
+				results[i] = c.DoOrCached("k", probe)
+			}(i)
+		}
+
+		close(start)
+		wg.Wait()
+
+		Expect(atomic.LoadInt32(&calls)).To(Equal(int32(1)),
+			"singleflight must collapse %d concurrent probes into one", N)
+		// Every caller must observe the shared probe's successful result.
+		for i, got := range results {
+			Expect(got).To(BeTrue(), "goroutine %d saw a different result", i)
+		}
+	})
+
+	It("treats different keys independently", func() {
+		cache := newProbeCache(time.Minute)
+		var probesA, probesB atomic.Int32
+		probeA := func() bool { probesA.Add(1); return true }
+		probeB := func() bool { probesB.Add(1); return true }
+
+		// Interleave the keys: the second "a" lookup must be a cache hit even
+		// though "b" was probed in between.
+		Expect(cache.DoOrCached("a", probeA)).To(BeTrue())
+		Expect(cache.DoOrCached("b", probeB)).To(BeTrue())
+		Expect(cache.DoOrCached("a", probeA)).To(BeTrue())
+
+		Expect(probesA.Load()).To(Equal(int32(1)))
+		Expect(probesB.Load()).To(Equal(int32(1)))
+	})
+
+	It("disables caching when TTL is zero", func() {
+		cache := newProbeCache(0)
+		var probes atomic.Int32
+		alwaysHealthy := func() bool {
+			probes.Add(1)
+			return true
+		}
+
+		// With a zero TTL nothing is ever fresh, so each call runs the probe.
+		const attempts = 3
+		for i := 0; i < attempts; i++ {
+			Expect(cache.DoOrCached("k", alwaysHealthy)).To(BeTrue())
+		}
+
+		Expect(probes.Load()).To(Equal(int32(attempts)))
+	})
+
+	It("Invalidate forces the next call to re-probe", func() {
+		// An hour-long TTL rules out expiry as the reason for the re-probe.
+		cache := newProbeCache(time.Hour)
+		var probes atomic.Int32
+		alwaysHealthy := func() bool {
+			probes.Add(1)
+			return true
+		}
+
+		// Populate the cache, drop the entry, then look it up again.
+		Expect(cache.DoOrCached("k", alwaysHealthy)).To(BeTrue())
+		cache.Invalidate("k")
+		Expect(cache.DoOrCached("k", alwaysHealthy)).To(BeTrue())
+
+		// One probe before the invalidation and one after.
+		Expect(probes.Load()).To(Equal(int32(2)))
+	})
+})
@@ -668,10 +668,21 @@ func (r *NodeRegistry) FindNodesWithModel(ctx context.Context, modelName string)
 	return nodes, nil
 }
 
-// FindAndLockNodeWithModel atomically finds the least-loaded node with the given
-// model loaded and increments its in-flight counter within a single transaction.
-// The SELECT FOR UPDATE row lock prevents concurrent eviction from removing the
-// NodeModel row between the find and increment operations.
+// FindAndLockNodeWithModel atomically finds the best loaded replica of the
+// given model and increments its in-flight counter within a single
+// transaction. The SELECT FOR UPDATE row lock prevents concurrent eviction
+// from removing the NodeModel row between the find and increment operations,
+// and serializes contending routers so concurrent picks distribute across
+// replicas instead of all landing on the same row.
+//
+// **Policy:** the SQL ORDER BY below MUST mirror PickBestReplica
+// (replicapicker.go). PickBestReplica is the canonical Go implementation of
+// the same rule — the per-frontend rotating-replica cache (TODO, see
+// pkg/model/loader.go) will eventually use it against in-memory snapshots so
+// hot inference requests don't pay this DB round-trip. If you change the
+// ordering here, change both sides; the "agrees with PickBestReplica on a
+// seeded dataset (policy mirror)" spec in registry_test.go fails fast if
+// they drift.
 //
 // When candidateNodeIDs is non-empty, only nodes in that set are considered.
 // Pass nil (or empty) to consider any node. This lets callers pre-filter by
@@ -683,16 +694,16 @@ func (r *NodeRegistry) FindAndLockNodeWithModel(ctx context.Context, modelName s
 	var node BackendNode
 
 	err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
-		// Order by in_flight ASC (least busy replica), then by last_used ASC
-		// (round-robin between equally-loaded replicas — oldest used wins, and
-		// every successful pick refreshes last_used below, so the "oldest" naturally
-		// rotates through the candidate set). available_vram DESC is the final
-		// tiebreaker for cold starts where last_used is identical.
+		// Mirror of PickBestReplica's policy (see replicapicker.go):
+		//   1. in_flight ASC — least busy replica.
+		//   2. last_used ASC — round-robin between equally-loaded replicas.
+		//      Every successful pick refreshes last_used below, so the
+		//      "oldest" tier naturally rotates through the candidate set.
+		//      Without this tier, in_flight ties collapsed to "fattest GPU
+		//      wins every time" and one node took nearly all the load.
+		//   3. available_vram DESC — final tiebreaker for cold starts where
+		//      last_used is identical across replicas.
 		//
-		// Without the last_used tier, a tie on in_flight (the common case at low
-		// to moderate concurrency where requests don't overlap) collapses to
-		// "biggest GPU wins every time" and one node ends up taking nearly all
-		// the load while replicas on other nodes sit idle.
 		// Filter on backend_nodes.status = healthy in the inner JOIN itself,
 		// not only in the later node-fetch step. The previous version picked
 		// a (node_id, replica) pair purely on node_models state, then bailed
 
@@ -3,6 +3,7 @@ package nodes
 import (
 	"context"
 	"runtime"
+	"time"
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -357,6 +358,79 @@ var _ = Describe("NodeRegistry", func() {
 			_, _, err := registry.FindAndLockNodeWithModel(context.Background(), "no-match-model", []string{emptyIncluded.ID})
 			Expect(err).To(HaveOccurred())
 		})
+
+			It("agrees with PickBestReplica on a seeded dataset (policy mirror)", func() {
+				// Guard against drift between the SQL ORDER BY in
+				// FindAndLockNodeWithModel and the canonical Go implementation in
+				// PickBestReplica. The two layers will eventually diverge in
+				// caller (DB-backed atomic pick vs in-memory snapshot pick for the
+				// per-frontend rotating cache), but the policy itself must stay
+				// the single source of truth. If this test fails, update *both*
+				// sides — never just one.
+				//
+				// Scenario exercises all three tiers:
+				//   - "loser-busy" has the most VRAM but in_flight=2 — loses tier 1.
+				//   - "loser-recent" ties at in_flight=0 but its last_used is the
+				//     newest of the in_flight=0 group — loses tier 2.
+				//   - "winner-mid" and "winner-fat" both tie at in_flight=0 and
+				//     share the oldest last_used — tier 3 decides: fattest wins.
+				loserBusy := makeNode("mirror-loser-busy", "10.0.0.70:50051", 32_000_000_000)
+				loserRecent := makeNode("mirror-loser-recent", "10.0.0.71:50051", 8_000_000_000)
+				winnerMid := makeNode("mirror-winner-mid", "10.0.0.72:50051", 16_000_000_000)
+				winnerFat := makeNode("mirror-winner-fat", "10.0.0.73:50051", 24_000_000_000)
+				for _, n := range []*BackendNode{loserBusy, loserRecent, winnerMid, winnerFat} {
+					Expect(registry.Register(context.Background(), n, true)).To(Succeed())
+					Expect(registry.SetNodeModel(context.Background(), n.ID, "mirror-model", 0, "loaded", "", 0)).To(Succeed())
+				}
+
+				// Force in_flight=2 on the "busy" node so tier 1 disqualifies it.
+				Expect(registry.IncrementInFlight(context.Background(), loserBusy.ID, "mirror-model", 0)).To(Succeed())
+				Expect(registry.IncrementInFlight(context.Background(), loserBusy.ID, "mirror-model", 0)).To(Succeed())
+
+				// Slam last_used to known values so the test is deterministic
+				// regardless of clock resolution between the helpers above.
+				// This has to happen after the helper calls so their own
+				// timestamps cannot overwrite the fixed values.
+				base := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)
+				set := func(id string, t time.Time) {
+					Expect(db.Model(&NodeModel{}).
+						Where("node_id = ? AND model_name = ?", id, "mirror-model").
+						Update("last_used", t).Error).To(Succeed())
+				}
+				set(loserBusy.ID, base) // last_used is irrelevant here — already disqualified by tier 1
+				set(loserRecent.ID, base.Add(time.Hour))
+				set(winnerMid.ID, base)
+				set(winnerFat.ID, base)
+
+				// Pull the same dataset both pickers will operate on. The Go
+				// picker is a faithful representation of the policy; the SQL is
+				// the production path. The snapshot is taken before the SQL pick
+				// because that pick increments in_flight on the row it selects.
+				var rows []NodeModel
+				Expect(db.Where("model_name = ? AND state = ?", "mirror-model", "loaded").
+					Find(&rows).Error).To(Succeed())
+				candidates := make([]ReplicaCandidate, 0, len(rows))
+				for _, nm := range rows {
+					// Join each replica row with its node to obtain the address
+					// and VRAM; the lookup also requires the node to be healthy.
+					var bn BackendNode
+					Expect(db.First(&bn, "id = ? AND status = ?", nm.NodeID, StatusHealthy).Error).To(Succeed())
+					candidates = append(candidates, ReplicaCandidate{
+						NodeID:        nm.NodeID,
+						Address:       bn.Address,
+						ReplicaIndex:  nm.ReplicaIndex,
+						InFlight:      nm.InFlight,
+						LastUsed:      nm.LastUsed,
+						AvailableVRAM: bn.AvailableVRAM,
+					})
+				}
+				goPick := PickBestReplica(candidates)
+				Expect(goPick).ToNot(BeNil())
+
+				sqlNode, _, err := registry.FindAndLockNodeWithModel(context.Background(), "mirror-model", nil)
+				Expect(err).ToNot(HaveOccurred())
+
+				// Both implementations must choose the same node.
+				Expect(sqlNode.ID).To(Equal(goPick.NodeID),
+					"SQL ORDER BY picked %s; PickBestReplica picked %s — policy has drifted",
+					sqlNode.ID, goPick.NodeID)
+				// Sanity check: the policy says winner-fat wins on tier 3.
+				Expect(goPick.NodeID).To(Equal(winnerFat.ID))
+			})
 	})
 
 	Describe("MarkHealthy and MarkUnhealthy round-trip", func() {