Skip to content

Commit 2497c9d

Browse files
authored
Merge pull request #28 from FluffyAIcode/AgentMemory/server-kv-gauge-idle-zero-8e7f
Gate scheduler_kv_live_bytes gauge on active session presence
2 parents 2af99b5 + 7e90cda commit 2497c9d

3 files changed

Lines changed: 97 additions & 18 deletions

File tree

inference_engine/server/app.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,12 +301,32 @@ async def metrics_endpoint() -> Response:
301301
# populates once PooledVerifier is wired (a post-v0.3.0
302302
# change) and otherwise reads 0 even while the verifier
303303
# cache is several MiB.
304+
#
305+
# Gauge semantics: "KV bytes attributable to in-flight
306+
# sessions". Between turns, the verifier's ``self.cache``
307+
# still holds the previous turn's tensors — the next
308+
# prefill calls ``reset()`` which replaces them, but until
309+
# then ``engine.kv_state()`` reports non-zero residual
310+
# bytes. Reporting that as "live" misleads observers
311+
# and breaks the §2.3 KV-bounded check (residual carries
312+
# forward at the previous turn's peak, never trimmed). We
313+
# therefore gate the gauge on ``active_count > 0``: an
314+
# idle server reports 0, a server with an active session
315+
# reports the verifier's true KV size. This is also how
316+
# the gauge will naturally behave once PooledVerifier is
317+
# wired post-v0.3.0 (the pool aggregation is 0 when no slab
318+
# is in use).
319+
kv_live = (
320+
int(engine_for_kv.kv_state())
321+
if scheduler.active_count > 0
322+
else 0
323+
)
304324
metrics.snapshot_scheduler(
305325
active=scheduler.active_count,
306326
pool_in_use=pool.in_use_count,
307327
pool_total=pool.total_count,
308328
pending=scheduler.pending_count,
309-
kv_live_bytes=int(engine_for_kv.kv_state()),
329+
kv_live_bytes=kv_live,
310330
)
311331
return PlainTextResponse(
312332
content=metrics.render(),

inference_engine/server/metrics.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,10 +172,13 @@ def build(cls) -> "Metrics":
172172
),
173173
scheduler_kv_live_bytes=Gauge(
174174
"scheduler_kv_live_bytes",
175-
"Bytes of KV cache currently live across all active "
176-
"sessions. Bounded by the per-session sink+window "
177-
"configuration; verifies the ADR 0006 §2.3 long-session "
178-
"memory-stability claim.",
175+
"Bytes of KV cache attributable to in-flight sessions. "
176+
"Reads 0 when no session is active (the verifier may "
177+
"still hold residual cache between turns, but that "
178+
"carry-over is reset on the next prefill, so it does "
179+
"not count as 'live' usage). Verifies the ADR 0006 §2.3 "
180+
"long-session memory-stability claim: bounded by the "
181+
"per-session sink+window configuration.",
179182
registry=registry,
180183
),
181184
scheduler_admission_total=Counter(

tests/inference_engine/server/test_app_metrics_and_auth.py

Lines changed: 69 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -105,22 +105,30 @@ async def test_metrics_kv_live_bytes_gauge_present_and_zero_at_idle(
105105
assert "scheduler_kv_live_bytes 0.0" in text
106106

107107

108-
async def test_metrics_kv_live_bytes_reflects_engine_kv_state(tokenizer):
108+
async def test_metrics_kv_live_bytes_reads_from_engine_during_active_session(
109+
tokenizer,
110+
):
109111
"""The /metrics handler must read KV bytes from the engine on
110-
every scrape (not from the pool). This is the v0.3 wiring that
111-
makes bench_long_session.py's in-flight scrape produce a
112-
non-zero number on real hardware — without it the gauge
113-
unconditionally reads 0 because no production code path sets
114-
the slab's live_kv_bytes_override.
112+
every scrape during an in-flight session.
113+
114+
This is the v0.3 wiring that makes bench_long_session.py's
115+
in-flight scrape produce a non-zero number on real hardware —
116+
without it the gauge unconditionally reads 0 because no
117+
production code path sets the slab's live_kv_bytes_override.
115118
116119
The 2026-05-30 short test #2 (results/.../bench_long_session_mac_short2_
117120
1780196477.json) recorded 7313 in-flight samples across 58 turns
118121
with pool_in_use=1 throughout, yet kv_live_bytes was 0.0 in every
119-
sample. This regression test pins the fix.
122+
sample. This regression test pins the fix end-to-end through real
123+
ASGI: spawn an in-flight chat-completion in a Task, race a /metrics
124+
scrape against it, assert the scrape sees the engine's kv_state.
120125
"""
121126
from tests.inference_engine.server.conftest import DeterministicEngine
122127

123-
class _KVAwareEngine(DeterministicEngine):
128+
class _KVAwareSlowEngine(DeterministicEngine):
129+
"""KV-reporting engine that pauses each token long enough for
130+
a /metrics scrape to race the chat-completion task."""
131+
124132
def __init__(self, *args, kv_value: int, **kwargs):
125133
super().__init__(*args, **kwargs)
126134
self._kv_value = kv_value
@@ -130,22 +138,70 @@ def kv_state(self) -> int:
130138

131139
eos = tokenizer.eos_token_id
132140
assert eos is not None
133-
hello = tokenizer._intern("hi")
134-
eng = _KVAwareEngine(
135-
fixed_tokens=[hello, eos],
141+
ids = [tokenizer._intern(f"tok{i}") for i in range(20)]
142+
eng = _KVAwareSlowEngine(
143+
fixed_tokens=ids + [eos],
136144
tokenizer=tokenizer,
137-
model_id_label="kv-aware",
145+
model_id_label="kv-aware-slow",
146+
per_token_delay_s=0.05,
138147
kv_value=12345678,
139148
)
140149
app = create_app(eng, ServerConfig(max_concurrent=1))
141150
async with AsyncClient(transport=ASGITransport(app=app),
142-
base_url="http://t") as c:
151+
base_url="http://t", timeout=30.0) as c:
152+
post_task = asyncio.create_task(c.post(
153+
"/v1/chat/completions",
154+
json={"model": "m",
155+
"messages": [{"role": "user", "content": "hi"}],
156+
"max_tokens": 20},
157+
))
158+
# Let the scheduler admit the request and the worker start
159+
await asyncio.sleep(0.1)
143160
r = await c.get("/metrics")
161+
await post_task
144162
assert r.status_code == 200
145163
assert "scheduler_kv_live_bytes 1.2345678e+07" in r.text or \
146164
"scheduler_kv_live_bytes 12345678" in r.text
147165

148166

167+
async def test_metrics_kv_live_bytes_zero_when_no_active_session(tokenizer):
168+
"""Between turns the verifier may hold residual KV (next prefill
169+
will reset it, but until then it sits in self.cache). Reporting
170+
that as 'live' breaks observability and breaks the §2.3 KV-bounded
171+
check — the residual would carry forward at the previous turn's
172+
peak forever. The gauge must therefore gate on
173+
``scheduler.active_count > 0``: idle scrape reads 0 even if
174+
engine.kv_state() is non-zero.
175+
"""
176+
from tests.inference_engine.server.conftest import DeterministicEngine
177+
178+
class _AlwaysHoldingEngine(DeterministicEngine):
179+
"""Engine whose verifier permanently holds 8 MiB of cache —
180+
simulates the post-turn residual state where the verifier has
181+
not yet been reset by a follow-up prefill."""
182+
183+
def kv_state(self) -> int:
184+
return 8 * 1024 * 1024
185+
186+
eos = tokenizer.eos_token_id
187+
assert eos is not None
188+
hello = tokenizer._intern("hi")
189+
eng = _AlwaysHoldingEngine(
190+
fixed_tokens=[hello, eos], tokenizer=tokenizer,
191+
model_id_label="residual-holder",
192+
)
193+
app = create_app(eng, ServerConfig(max_concurrent=1))
194+
async with AsyncClient(transport=ASGITransport(app=app),
195+
base_url="http://t") as c:
196+
# No in-flight request → active_count == 0 → gauge gated to 0
197+
r = await c.get("/metrics")
198+
assert r.status_code == 200
199+
assert "scheduler_kv_live_bytes 0.0" in r.text
200+
# Crucially, the engine's residual is NOT exposed on the gauge:
201+
assert "scheduler_kv_live_bytes 8388608" not in r.text
202+
assert "scheduler_kv_live_bytes 8.388608e+06" not in r.text
203+
204+
149205
# ---------------------------------------------------------------------------
150206
# OpenAI error envelope
151207
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)