@@ -92,9 +92,9 @@ async def test_metrics_kv_live_bytes_gauge_present_and_zero_at_idle(
9292 short_engine ,
9393):
9494 """The KV-live-bytes gauge must be exposed and read 0 on an idle
95- pool (every slab has logical_size == 0) . This is the gauge that
96- bench_long_session.py scrapes to verify the ADR 0006 §2.3
97- KV-bounded claim, so its presence is part of the public contract.
95+ engine . This is the gauge that bench_long_session.py scrapes to
96+ verify the ADR 0006 §2.3 KV-bounded claim, so its presence is
97+ part of the public contract.
9898 """
9999 app = create_app (short_engine , ServerConfig (max_concurrent = 2 ))
100100 async with AsyncClient (transport = ASGITransport (app = app ),
@@ -105,6 +105,47 @@ async def test_metrics_kv_live_bytes_gauge_present_and_zero_at_idle(
105105 assert "scheduler_kv_live_bytes 0.0" in text
106106
107107
async def test_metrics_kv_live_bytes_reflects_engine_kv_state(tokenizer):
    """The /metrics handler must read KV bytes from the engine on
    every scrape (not from the pool). This is the v0.3 wiring that
    makes bench_long_session.py's in-flight scrape produce a
    non-zero number on real hardware — without it the gauge
    unconditionally reads 0 because no production code path sets
    the slab's live_kv_bytes_override.

    The 2026-05-30 short test #2 (results/.../bench_long_session_mac_short2_
    1780196477.json) recorded 7313 in-flight samples across 58 turns
    with pool_in_use=1 throughout, yet kv_live_bytes was 0.0 in every
    sample. This regression test pins the fix.

    The gauge is parsed numerically rather than matched as a substring,
    so the check does not depend on how the exporter formats floats and
    cannot be satisfied by a longer number that merely starts with the
    expected digits. The engine's value is then changed and /metrics is
    scraped a second time to pin the "on every scrape" part of the
    contract.
    """
    from tests.inference_engine.server.conftest import DeterministicEngine

    class _KVAwareEngine(DeterministicEngine):
        """DeterministicEngine whose kv_state() returns a settable value."""

        def __init__(self, *args, kv_value: int, **kwargs):
            super().__init__(*args, **kwargs)
            self._kv_value = kv_value

        def kv_state(self) -> int:
            return self._kv_value

    def _gauge_value(text: str) -> float:
        """Return the scheduler_kv_live_bytes sample value from *text*."""
        for line in text.splitlines():
            # Comparing the whole first field skips "# HELP"/"# TYPE"
            # lines and any metric whose name merely shares this prefix.
            name, _, rest = line.partition(" ")
            if name == "scheduler_kv_live_bytes" and rest.strip():
                return float(rest.split()[0])
        raise AssertionError("scheduler_kv_live_bytes sample not found in /metrics")

    eos = tokenizer.eos_token_id
    assert eos is not None
    hello = tokenizer._intern("hi")
    eng = _KVAwareEngine(
        fixed_tokens=[hello, eos],
        tokenizer=tokenizer,
        model_id_label="kv-aware",
        kv_value=12345678,
    )
    app = create_app(eng, ServerConfig(max_concurrent=1))
    async with AsyncClient(transport=ASGITransport(app=app),
                           base_url="http://t") as c:
        first = await c.get("/metrics")
        assert first.status_code == 200
        assert _gauge_value(first.text) == 12345678

        # A handler that snapshotted the engine once would still report
        # the old value here; a per-scrape read must report the new one.
        eng._kv_value = 87654321
        second = await c.get("/metrics")
        assert second.status_code == 200
        assert _gauge_value(second.text) == 87654321
147+
148+
108149# ---------------------------------------------------------------------------
109150# OpenAI error envelope
110151# ---------------------------------------------------------------------------
0 commit comments