@@ -105,22 +105,30 @@ async def test_metrics_kv_live_bytes_gauge_present_and_zero_at_idle(
105105 assert "scheduler_kv_live_bytes 0.0" in text
106106
107107
108- async def test_metrics_kv_live_bytes_reflects_engine_kv_state (tokenizer ):
108+ async def test_metrics_kv_live_bytes_reads_from_engine_during_active_session (
109+ tokenizer ,
110+ ):
109111 """The /metrics handler must read KV bytes from the engine on
110- every scrape (not from the pool). This is the v0.3 wiring that
111- makes bench_long_session.py's in-flight scrape produce a
112- non-zero number on real hardware — without it the gauge
113- unconditionally reads 0 because no production code path sets
114- the slab's live_kv_bytes_override.
112+ every scrape during an in-flight session.
113+
114+ This is the v0.3 wiring that makes bench_long_session.py's
115+ in-flight scrape produce a non-zero number on real hardware —
116+ without it the gauge unconditionally reads 0 because no
117+ production code path sets the slab's live_kv_bytes_override.
115118
116119 The 2026-05-30 short test #2 (results/.../bench_long_session_mac_short2_
117120 1780196477.json) recorded 7313 in-flight samples across 58 turns
118121 with pool_in_use=1 throughout, yet kv_live_bytes was 0.0 in every
119- sample. This regression test pins the fix.
122+ sample. This regression test pins the fix end-to-end through real
123+ ASGI: spawn an in-flight chat-completion in a Task, race a /metrics
124+ scrape against it, assert the scrape sees the engine's kv_state.
120125 """
121126 from tests .inference_engine .server .conftest import DeterministicEngine
122127
123- class _KVAwareEngine (DeterministicEngine ):
128+ class _KVAwareSlowEngine (DeterministicEngine ):
129+ """KV-reporting engine that pauses each token long enough for
130+ a /metrics scrape to race the chat-completion task."""
131+
124132 def __init__ (self , * args , kv_value : int , ** kwargs ):
125133 super ().__init__ (* args , ** kwargs )
126134 self ._kv_value = kv_value
@@ -130,22 +138,70 @@ def kv_state(self) -> int:
130138
131139 eos = tokenizer .eos_token_id
132140 assert eos is not None
133- hello = tokenizer ._intern ("hi" )
134- eng = _KVAwareEngine (
135- fixed_tokens = [ hello , eos ],
141+ ids = [ tokenizer ._intern (f"tok { i } " ) for i in range ( 20 )]
142+ eng = _KVAwareSlowEngine (
143+ fixed_tokens = ids + [ eos ],
136144 tokenizer = tokenizer ,
137- model_id_label = "kv-aware" ,
145+ model_id_label = "kv-aware-slow" ,
146+ per_token_delay_s = 0.05 ,
138147 kv_value = 12345678 ,
139148 )
140149 app = create_app (eng , ServerConfig (max_concurrent = 1 ))
141150 async with AsyncClient (transport = ASGITransport (app = app ),
142- base_url = "http://t" ) as c :
151+ base_url = "http://t" , timeout = 30.0 ) as c :
152+ post_task = asyncio .create_task (c .post (
153+ "/v1/chat/completions" ,
154+ json = {"model" : "m" ,
155+ "messages" : [{"role" : "user" , "content" : "hi" }],
156+ "max_tokens" : 20 },
157+ ))
158+ # Let the scheduler admit and the worker start
159+ await asyncio .sleep (0.1 )
143160 r = await c .get ("/metrics" )
161+ await post_task
144162 assert r .status_code == 200
145163 assert "scheduler_kv_live_bytes 1.2345678e+07" in r .text or \
146164 "scheduler_kv_live_bytes 12345678" in r .text
147165
148166
167+ async def test_metrics_kv_live_bytes_zero_when_no_active_session (tokenizer ):
168+ """Between turns the verifier may hold residual KV (next prefill
169+ will reset it, but until then it sits in self.cache). Reporting
170+ that as 'live' breaks observability and breaks the §2.3 KV-bounded
171+ check — the residual would carry forward at the previous turn's
172+ peak forever. The gauge must therefore gate on
173+ ``scheduler.active_count > 0``: idle scrape reads 0 even if
174+ engine.kv_state() is non-zero.
175+ """
176+ from tests .inference_engine .server .conftest import DeterministicEngine
177+
178+ class _AlwaysHoldingEngine (DeterministicEngine ):
179+ """Engine whose verifier permanently holds 8 MiB of cache —
180+ simulates the post-turn residual state where the verifier has
181+ not yet been reset by a follow-up prefill."""
182+
183+ def kv_state (self ) -> int :
184+ return 8 * 1024 * 1024
185+
186+ eos = tokenizer .eos_token_id
187+ assert eos is not None
188+ hello = tokenizer ._intern ("hi" )
189+ eng = _AlwaysHoldingEngine (
190+ fixed_tokens = [hello , eos ], tokenizer = tokenizer ,
191+ model_id_label = "residual-holder" ,
192+ )
193+ app = create_app (eng , ServerConfig (max_concurrent = 1 ))
194+ async with AsyncClient (transport = ASGITransport (app = app ),
195+ base_url = "http://t" ) as c :
196+ # No in-flight request → active_count == 0 → gauge gated to 0
197+ r = await c .get ("/metrics" )
198+ assert r .status_code == 200
199+ assert "scheduler_kv_live_bytes 0.0" in r .text
200+ # Crucially, the engine's residual is NOT exposed on the gauge:
201+ assert "scheduler_kv_live_bytes 8388608" not in r .text
202+ assert "scheduler_kv_live_bytes 8.388608e+06" not in r .text
203+
204+
149205# ---------------------------------------------------------------------------
150206# OpenAI error envelope
151207# ---------------------------------------------------------------------------
0 commit comments