Skip to content

Commit 26ca0dc

Browse files
authored
feat(server): prefix-aware inline prefix-cache eviction (#452)
The inline prefix cache evicted by pure LRU, which can drop a short, broadly-shared ancestor prefix (e.g. system prompt plus first turn, reused by many later branches) while keeping a long, conversation-specific leaf snapshot that nothing else reuses. The next branch that needs the shared ancestor then re-prefills it. Make eviction prefix-aware: prefer evicting the oldest leaf (an entry whose tokens are not a strict prefix of any other live entry) so shared ancestors stay resident. Falls back to plain LRU when no ancestor structure exists. This is not strictly better than LRU. It keeps the frozen shallow ancestors plus the current deepest entry, while LRU keeps the N most-recent entries. It wins when later branches reuse an early shared root (the agentic system-prompt pattern) and can lose when a branch reuses a recent but non-current prefix that LRU would still hold. Linear conversations are unaffected: both keep the deepest entry. The policy is a pure free function select_inline_evict_victim over the cached prefixes in LRU order; inline entries now carry their prefix tokens so the leaf test can run. Contained to prefix_cache; no backend or request-protocol change. Adds model-free unit tests for the policy.
1 parent cd8b065 commit 26ca0dc

3 files changed

Lines changed: 105 additions & 8 deletions

File tree

server/src/server/prefix_cache.cpp

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,38 @@ PrefixHash hash_prefix(const int32_t * ids, int count) {
140140
return h;
141141
}
142142

143+
// ─── Prefix-aware eviction ──────────────────────────────────────────────
144+
145+
// Reports whether `a` is a strict prefix of `b`: `a` must be shorter than `b`
// and each element of `a` must equal the element of `b` at the same index.
// Consequently an empty `a` is a strict prefix of every non-empty `b`, and two
// equal sequences are never strict prefixes of one another.
static bool is_strict_prefix(const std::vector<int32_t> & a,
                             const std::vector<int32_t> & b) {
    const std::vector<int32_t>::size_type prefix_len = a.size();
    if (prefix_len >= b.size()) {
        return false;  // equal length or longer can never be a *strict* prefix
    }
    // b has more than prefix_len elements here, so b[pos] is always in range.
    for (std::vector<int32_t>::size_type pos = 0; pos < prefix_len; ++pos) {
        if (a[pos] != b[pos]) {
            return false;
        }
    }
    return true;
}
151+
152+
int select_inline_evict_victim(const std::vector<const std::vector<int32_t> *> & ids_lru) {
153+
const int n = (int)ids_lru.size();
154+
if (n <= 0) return 0;
155+
// Oldest-first scan: evict the first entry that is not a strict prefix of any
156+
// other entry (a leaf). Shared ancestor prefixes are thereby kept resident.
157+
for (int i = 0; i < n; i++) {
158+
bool is_ancestor = false;
159+
for (int j = 0; j < n; j++) {
160+
if (j == i) continue;
161+
if (is_strict_prefix(*ids_lru[i], *ids_lru[j])) { is_ancestor = true; break; }
162+
}
163+
if (!is_ancestor) return i; // oldest leaf
164+
}
165+
return 0; // unreachable (the longest entry is always a leaf); pure-LRU fallback
166+
}
167+
168+
int select_inline_evict_victim(const std::vector<std::vector<int32_t>> & ids_lru) {
169+
std::vector<const std::vector<int32_t> *> ptrs;
170+
ptrs.reserve(ids_lru.size());
171+
for (const auto & v : ids_lru) ptrs.push_back(&v);
172+
return select_inline_evict_victim(ptrs);
173+
}
174+
143175
// ─── PrefixCache ────────────────────────────────────────────────────────
144176

145177
PrefixCache::PrefixCache(int cap, const Tokenizer & tokenizer)
@@ -171,9 +203,9 @@ int PrefixCache::find_entry(const PrefixHash & h) const {
171203

172204
void PrefixCache::move_to_end(int idx) {
173205
if (idx < 0 || idx >= (int)entries_.size()) return;
174-
auto e = entries_[idx];
206+
auto e = std::move(entries_[idx]);
175207
entries_.erase(entries_.begin() + idx);
176-
entries_.push_back(e);
208+
entries_.push_back(std::move(e));
177209
}
178210

179211
int PrefixCache::find_full_entry(const PrefixHash & h) const {
@@ -235,10 +267,22 @@ std::pair<int, int> PrefixCache::prepare_inline_snap(
235267

236268
int slot;
237269
if ((int)entries_.size() >= cap_) {
238-
// At capacity — reserve the LRU slot without evicting yet.
239-
pending_evict_key_ = entries_.front().hash;
270+
// At capacity — reserve a slot without evicting yet. Prefix-aware: prefer
271+
// the oldest leaf so shared ancestor prefixes (reused by later branches)
272+
// stay resident. entries_ is already in LRU order (front = oldest).
273+
std::vector<const std::vector<int32_t> *> ids_lru;
274+
ids_lru.reserve(entries_.size());
275+
for (const auto & e : entries_) ids_lru.push_back(&e.ids);
276+
int victim = select_inline_evict_victim(ids_lru);
277+
pending_evict_key_ = entries_[victim].hash;
240278
has_pending_evict_ = true;
241-
slot = entries_.front().slot;
279+
slot = entries_[victim].slot;
280+
if (victim != 0) {
281+
std::fprintf(stderr,
282+
"[pc] prefix-aware evict: victim idx=%d (len=%zu) kept oldest "
283+
"ancestor (len=%zu)\n",
284+
victim, entries_[victim].ids.size(), entries_.front().ids.size());
285+
}
242286
} else {
243287
slot = next_slot_;
244288
next_slot_ = (next_slot_ + 1) % cap_;
@@ -278,7 +322,8 @@ void PrefixCache::confirm_inline_snap(int slot, int target_cut,
278322
}
279323

280324
auto key = hash_prefix(prompt_ids.data(), target_cut);
281-
entries_.push_back({key, slot});
325+
std::vector<int32_t> ids(prompt_ids.begin(), prompt_ids.begin() + target_cut);
326+
entries_.push_back({key, slot, std::move(ids)});
282327
entries_size_count_.fetch_add(1, std::memory_order_relaxed);
283328
std::fprintf(stderr, "[pc] inline-snap committed slot=%d prefix_len=%d\n",
284329
slot, target_cut);

server/src/server/prefix_cache.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,18 @@ std::vector<int> find_all_boundaries(const std::vector<int32_t> & ids,
4545
using PrefixHash = std::array<uint8_t, 16>;
4646
PrefixHash hash_prefix(const int32_t * ids, int count);
4747

48+
// Prefix-aware inline eviction policy. Given the cached prefixes in LRU order
49+
// (index 0 = oldest), return the index of the eviction victim: the oldest entry
50+
// whose ids are NOT a strict prefix of any other entry's ids (a "leaf"). Keeping
51+
// shared ancestor prefixes resident avoids re-prefilling them for later branches.
52+
// Returns 0 (pure-LRU fallback) when ids_lru is empty or, impossibly, no leaf
53+
// is found. Pure and model-free so it can be unit-tested without a PrefixCache.
54+
// The pointer overload is the core (the caller passes pointers into its own
55+
// entries so no token vectors are copied); the value overload is a convenience
56+
// wrapper for tests.
57+
int select_inline_evict_victim(const std::vector<const std::vector<int32_t> *> & ids_lru);
58+
int select_inline_evict_victim(const std::vector<std::vector<int32_t>> & ids_lru);
59+
4860
// ─── Prefix cache entry ─────────────────────────────────────────────────
4961

5062
struct FullCacheEntry {
@@ -139,8 +151,9 @@ class PrefixCache {
139151
// LRU for inline prefix cache: each entry pairs a prefix hash with its slot and the prefix tokens.
140152
// We use a vector to maintain insertion order (front = oldest).
141153
struct LruEntry {
142-
PrefixHash hash;
143-
int slot;
154+
PrefixHash hash;
155+
int slot;
156+
std::vector<int32_t> ids; // prefix tokens [0, target_cut) for prefix-aware eviction
144157
};
145158
std::vector<LruEntry> entries_;
146159
int next_slot_ = 0;

server/test/test_server_unit.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1242,6 +1242,40 @@ static void test_find_boundaries_empty() {
12421242
TEST_ASSERT(bounds.empty());
12431243
}
12441244

1245+
// ── Prefix-aware eviction policy (model-free) ───────────────────────────
1246+
1247+
static void test_evict_empty_is_zero() {
    // Nothing cached: the policy has no candidate and answers 0 by convention.
    const std::vector<std::vector<int32_t>> ids{};
    TEST_ASSERT(select_inline_evict_victim(ids) == 0);
}
1251+
1252+
static void test_evict_single_is_zero() {
    // A lone cached prefix has nothing extending it, so it is the oldest leaf.
    std::vector<std::vector<int32_t>> ids;
    ids.push_back(std::vector<int32_t>{1, 2, 3});
    TEST_ASSERT(select_inline_evict_victim(ids) == 0);
}
1256+
1257+
static void test_evict_chain_keeps_ancestors() {
    // Linear chain, oldest first: every entry extends the one before it, so the
    // two shorter entries are ancestors and only the last (longest) is a leaf.
    // The policy must therefore pick index 2 and keep the shared ancestors.
    std::vector<std::vector<int32_t>> ids;
    ids.push_back(std::vector<int32_t>{9});
    ids.push_back(std::vector<int32_t>{9, 1});
    ids.push_back(std::vector<int32_t>{9, 1, 2});
    TEST_ASSERT(select_inline_evict_victim(ids) == 2);
}
1263+
1264+
static void test_evict_unrelated_falls_back_to_lru() {
    // No entry is a prefix of another, so every entry is a leaf and the policy
    // degenerates to plain LRU: the oldest entry (index 0) is the victim.
    std::vector<std::vector<int32_t>> ids;
    ids.push_back(std::vector<int32_t>{1, 1});
    ids.push_back(std::vector<int32_t>{2, 2});
    ids.push_back(std::vector<int32_t>{3, 3});
    TEST_ASSERT(select_inline_evict_victim(ids) == 0);
}
1269+
1270+
static void test_evict_branch_spares_shared_root() {
1271+
// [s] is an ancestor of both branches, so it is never the victim; the oldest
1272+
// leaf ([s,a] at index 1) is evicted instead.
1273+
std::vector<std::vector<int32_t>> ids = {{9}, {9, 1}, {9, 2}};
1274+
int v = select_inline_evict_victim(ids);
1275+
TEST_ASSERT(v == 1);
1276+
TEST_ASSERT(v != 0); // the shared root must be spared
1277+
}
1278+
12451279
// ═══════════════════════════════════════════════════════════════════════
12461280
// PFlash config tests (model-free)
12471281
// ═══════════════════════════════════════════════════════════════════════
@@ -4054,6 +4088,11 @@ int main() {
40544088
RUN_TEST(test_hash_prefix_different_lengths);
40554089
RUN_TEST(test_hash_prefix_empty);
40564090
RUN_TEST(test_find_boundaries_empty);
4091+
RUN_TEST(test_evict_empty_is_zero);
4092+
RUN_TEST(test_evict_single_is_zero);
4093+
RUN_TEST(test_evict_chain_keeps_ancestors);
4094+
RUN_TEST(test_evict_unrelated_falls_back_to_lru);
4095+
RUN_TEST(test_evict_branch_spares_shared_root);
40574096

40584097
std::fprintf(stderr, "\n── PFlash config ──\n");
40594098
RUN_TEST(test_pflash_config_defaults);

0 commit comments

Comments
 (0)