Skip to content

Commit d12af68

Browse files
authored
Merge pull request #49 from FluffyAIcode/AgentMemory/v030-pr-d1-remove-adr-0007-server-deadcode-8e7f
PR-D1 (ADR 0008 Phase D): remove ADR 0007 server-side dead code
2 parents bec3d7b + ac533d3 commit d12af68

9 files changed

Lines changed: 78 additions & 540 deletions

File tree

docs/adr/0008-session-bound-runtime-and-grpc-protocol.md

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -775,12 +775,44 @@ parallelize.
775775

776776
### 6.4 Phase D — Deprecated HTTP+SSE shim
777777

778-
- **PR-D1**: Update `inference_engine/server/app.py` so each
779-
`/v1/chat/completions` request creates a single-shot session under
780-
the new `SessionStore`, prefills, generates, and closes. Removes
781-
any path-selection / cross-request logic (none of which exists on
782-
`main` after C3). Adds `Deprecation` / `Sunset` headers. Updates
783-
the existing 461-test integration suite to match.
778+
*(scope split, recorded 2026-06-01 during implementation of PR-D1.)*
779+
780+
The original PR-D1 entry conflated two coupled changes:
781+
782+
(a) Remove the ADR 0007 dead code from the server-side surface
783+
(path_selection metrics, `_emit_path_selection_metric` helper,
784+
`engine_result` field on the scheduler session, etc.).
785+
(b) Refactor the HTTP shim's chat-completions handler onto the new
786+
`SessionStore` so each request becomes a single-shot session
787+
(prefill → generate → close) instead of being driven by the
788+
legacy `PooledVerifier`.
789+
790+
(a) is a pure subtraction: the dead code was reachable only from the
791+
ADR 0007 path_select stack that PR-A3 already removed from the
792+
verifier side; the server-side metrics and helpers it left behind
793+
are unreachable at runtime in any healthy completion. (b) is a
794+
larger refactor of feature-frozen code (per §2.7), with a
795+
corresponding test-update tail.
796+
797+
The two are split, following the same pattern as PR-A3 / PR-A3b:
798+
799+
- **PR-D1** (this PR, dead-code removal): cleans up §6.6 rows for
800+
`app.py` / `engine.py` / `metrics.py` / `scheduler/session.py` /
801+
`bench_long_session.py`. The HTTP shim continues to use
802+
`PooledVerifier` exactly as before; nothing user-observable
803+
changes except the disappearance of the four ADR 0007 metrics
804+
from `/metrics` and the `acceptance_rate` field from the OpenAI
805+
response (the latter was sourced from `engine_result`, which is
806+
gone). Unit-test coverage on Linux is 100%.
807+
808+
- **PR-D2** (queued, not in PR-D1's diff): the HTTP-shim refactor
809+
proper. Each `/v1/chat/completions` request creates a single-shot
810+
session under `SessionStore`, prefills, generates, and closes;
811+
`PooledVerifier` is retired. Adds `Deprecation` / `Sunset`
812+
headers per §2.7. Updates the existing integration suite to
813+
match. Linux-only path; §9 carve-out continues to apply. PR-D2
814+
is non-blocking for v0.3 GA — the deprecated shim works on
815+
`main` post-PR-D1 in its v0.3.0-rc1 shape, just lighter.
784816

785817
### 6.5 Phase E — Mac M4 integration test marker + CI workflow
786818

inference_engine/scheduler/scheduler.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -358,12 +358,12 @@ def on_token(tok_id: int) -> bool:
358358
session.eos_token_ids, on_token,
359359
)
360360

361-
# Out of engine lock — finalize state.
362-
# Stash the engine result on the session so route handlers
363-
# can read path-selection observability fields (ADR 0007
364-
# §2.10) and acceptance rate. tokens were already streamed
365-
# via on_token.
366-
session.engine_result = result
361+
# Out of engine lock — finalize state. Tokens were already
362+
# streamed via on_token; the engine result is otherwise
363+
# discarded (PR-D1 of ADR 0008 removed the engine_result
364+
# stash that ADR 0007 §2.10 used for path-selection
365+
# observability).
366+
del result
367367
if session.state == SessionState.CANCELLED:
368368
# Already counted by cancel_session caller; we just
369369
# observe the terminal state here.

inference_engine/scheduler/session.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,6 @@ class Session:
6464
# the scheduler.iter_tokens() async iterator drain this; the
6565
# scheduler's worker pushes into it.
6666
token_queue: asyncio.Queue = field(default_factory=lambda: asyncio.Queue())
67-
# The engine's full result, set by the scheduler worker after
68-
# ``engine.generate()`` returns. Route handlers read this to
69-
# populate ADR 0007 §2.10 path-selection observability metrics
70-
# (path_selection, tokens_skipped, prefill_duration_seconds) and
71-
# acceptance-rate stats. ``None`` until the engine returns —
72-
# callers must check before reading.
73-
engine_result: Optional[object] = None
7467

7568
def __post_init__(self) -> None:
7669
if not self.prompt_ids:

inference_engine/server/app.py

Lines changed: 2 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -442,9 +442,8 @@ async def chat_completions(req: ChatCompletionRequest, request: Request):
442442
metrics.record_completion(
443443
finish_reason=finish_reason,
444444
n_tokens=len(output_token_ids),
445-
acceptance_rate=_session_acceptance_rate(scheduler, session),
445+
acceptance_rate=None,
446446
)
447-
_emit_path_selection_metric(metrics, session)
448447

449448
return JSONResponse(
450449
content=ChatCompletionResponse(
@@ -495,53 +494,6 @@ def _encode_prompt(engine: Engine, req: ChatCompletionRequest) -> List[int]:
495494
return prompt_ids
496495

497496

498-
def _session_acceptance_rate(
499-
scheduler: Scheduler, session: Session,
500-
) -> Optional[float]:
501-
"""Per-session acceptance rate from the stashed EngineResult.
502-
503-
The scheduler worker stores ``engine.generate()``'s result on
504-
``session.engine_result`` after generation completes (PR 7-4).
505-
Returns ``None`` if the result is unavailable (session was
506-
cancelled / failed before the engine returned, or the engine
507-
is a test double that doesn't expose the field).
508-
"""
509-
_ = scheduler # kept for signature stability with existing callers
510-
result = getattr(session, "engine_result", None)
511-
if result is None:
512-
return None
513-
rate = getattr(result, "acceptance_rate", None)
514-
if rate is None:
515-
return None
516-
return float(rate)
517-
518-
519-
def _emit_path_selection_metric(
520-
metrics: "Metrics", session: Session,
521-
) -> None:
522-
"""Emit ADR 0007 §2.10 path-selection observability for one
523-
completed session, if the engine reported the relevant fields.
524-
525-
Called from both the streaming and non-streaming completion
526-
paths after the session reaches a terminal state. No-op when
527-
the engine result is unavailable (e.g., test doubles that
528-
don't populate path_selection).
529-
"""
530-
result = getattr(session, "engine_result", None)
531-
if result is None:
532-
return
533-
path = getattr(result, "path_selection", None)
534-
if path not in ("continuation", "new_session"):
535-
return
536-
metrics.record_path_selection(
537-
path=path,
538-
tokens_skipped=int(getattr(result, "tokens_skipped", 0)),
539-
prefill_duration_s=float(
540-
getattr(result, "prefill_duration_seconds", 0.0)
541-
),
542-
)
543-
544-
545497
async def _collect_non_streaming_tokens(
546498
*,
547499
scheduler: Scheduler,
@@ -662,7 +614,6 @@ def envelope(content_delta, role_delta, finish_reason) -> dict:
662614
metrics.record_completion(
663615
finish_reason=finish_reason,
664616
n_tokens=len(session.output_token_ids),
665-
acceptance_rate=_session_acceptance_rate(scheduler, session),
617+
acceptance_rate=None,
666618
)
667-
_emit_path_selection_metric(metrics, session)
668619
yield {"data": "[DONE]"}

inference_engine/server/engine.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,6 @@ class EngineResult:
4444
proposer_forward_calls: int
4545
verifier_forward_calls: int
4646
stopped_on_eos: bool
47-
# ADR 0007 §2.10 observability — populated by the speculative
48-
# engine; test doubles default to ``new_session`` / 0 so the
49-
# route layer's metric emission code path is exercisable
50-
# against either backend.
51-
path_selection: str = "new_session" # "continuation" | "new_session"
52-
tokens_skipped: int = 0
53-
prefill_duration_seconds: float = 0.0
5447

5548

5649
@runtime_checkable
@@ -191,11 +184,6 @@ def generate(
191184
proposer_forward_calls=int(result.proposer_forward_calls),
192185
verifier_forward_calls=int(result.verifier_forward_calls),
193186
stopped_on_eos=stopped_on_eos,
194-
path_selection=str(getattr(result, "path_selection", "new_session")),
195-
tokens_skipped=int(getattr(result, "tokens_skipped", 0)),
196-
prefill_duration_seconds=float(
197-
getattr(result, "prefill_duration_seconds", 0.0)
198-
),
199187
)
200188

201189
def kv_state(self) -> int:

inference_engine/server/metrics.py

Lines changed: 0 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -107,13 +107,6 @@ class Metrics:
107107
scheduler_pending: Gauge
108108
scheduler_kv_live_bytes: Gauge
109109
scheduler_admission_total: Counter
110-
# ADR 0007 §2.10 — cross-request KV reuse observability.
111-
# Both ``path`` labels are first-class outcomes; neither is an
112-
# "error" or "fallback" (per ADR 0007 §2.4.c).
113-
path_selection_total: Counter
114-
continuation_tokens_skipped_total: Counter
115-
verifier_prefill_duration_seconds: Histogram
116-
cache_invariant_violations_total: Counter
117110

118111
@classmethod
119112
def build(cls) -> "Metrics":
@@ -194,47 +187,6 @@ def build(cls) -> "Metrics":
194187
labelnames=["result"],
195188
registry=registry,
196189
),
197-
path_selection_total=Counter(
198-
"path_selection_total",
199-
"Total path-selection decisions made by the verifier "
200-
"for cross-request KV cache reuse (ADR 0007 §2.4). "
201-
"Both 'continuation' and 'new_session' are first-class "
202-
"first-class outcomes; neither is an 'error' or "
203-
"'fallback' (§2.4.c). Healthy long-session agent "
204-
"workloads see continuation rate >= 95%.",
205-
labelnames=["path"],
206-
registry=registry,
207-
),
208-
continuation_tokens_skipped_total=Counter(
209-
"continuation_tokens_skipped_total",
210-
"Cumulative prompt tokens that the continuation path "
211-
"did not need to re-prefill (ADR 0007 §2.10). Sums "
212-
"ContinuationPlan.skip_n across every continuation-"
213-
"path request the server has handled. The win.",
214-
registry=registry,
215-
),
216-
verifier_prefill_duration_seconds=Histogram(
217-
"verifier_prefill_duration_seconds",
218-
"Wall time of the prefill phase of a single request, "
219-
"partitioned by path. Continuation-path histogram "
220-
"centers around per-incremental-token cost; "
221-
"new-session-path histogram tracks full-prefill cost "
222-
"(O(history_length)).",
223-
labelnames=["path"],
224-
buckets=(
225-
0.001, 0.005, 0.01, 0.05, 0.1, 0.5,
226-
1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0,
227-
),
228-
registry=registry,
229-
),
230-
cache_invariant_violations_total=Counter(
231-
"cache_invariant_violations_total",
232-
"Count of ADR 0007 §2.9 INV-1 / INV-2 detections at "
233-
"runtime. Should always read 0; any non-zero value is "
234-
"a critical operational alert (page on it).",
235-
labelnames=["kind"],
236-
registry=registry,
237-
),
238190
)
239191

240192
# ------------------------------------------------------------------
@@ -255,32 +207,6 @@ def record_admission(self, *, admitted: bool) -> None:
255207
result="admitted" if admitted else "rejected"
256208
).inc()
257209

258-
def record_path_selection(self, *, path: str, tokens_skipped: int,
259-
prefill_duration_s: float) -> None:
260-
"""Record one path-selection decision (ADR 0007 §2.10).
261-
262-
``path`` must be ``"continuation"`` or ``"new_session"``. The
263-
method does not validate the label set explicitly because
264-
prometheus-client's ``labels()`` already raises for unknown
265-
labels; we want such a violation to surface loudly per the
266-
no-silent-failure principle.
267-
"""
268-
self.path_selection_total.labels(path=path).inc()
269-
if tokens_skipped > 0:
270-
self.continuation_tokens_skipped_total.inc(tokens_skipped)
271-
self.verifier_prefill_duration_seconds.labels(path=path).observe(
272-
float(prefill_duration_s)
273-
)
274-
275-
def record_cache_invariant_violation(self, *, kind: str) -> None:
276-
"""Record an INV-1 or INV-2 detection (ADR 0007 §2.9).
277-
278-
``kind`` must be ``"inv1"`` or ``"inv2"``. Should never be
279-
called in healthy operation; any increment of this counter
280-
is a critical alert.
281-
"""
282-
self.cache_invariant_violations_total.labels(kind=kind).inc()
283-
284210
def record_completion(self, *, finish_reason: str, n_tokens: int,
285211
acceptance_rate: Optional[float]) -> None:
286212
self.inference_completions_total.labels(

0 commit comments

Comments
 (0)