FluffyAIcode
diff --git a/‎.github/workflows/ci.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/ci.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/adr/0008-session-bound-runtime-and-grpc-protocol.md‎
Lines changed: 38 additions & 6 deletions b/‎docs/adr/0008-session-bound-runtime-and-grpc-protocol.md‎
Lines changed: 38 additions & 6 deletions
diff --git a/‎inference_engine/backends/mlx/verifier.py‎
Lines changed: 30 additions & 0 deletions b/‎inference_engine/backends/mlx/verifier.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎inference_engine/bench/__init__.py‎
Lines changed: 7 additions & 0 deletions b/‎inference_engine/bench/__init__.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎inference_engine/bench/session_long_run.py‎
Lines changed: 210 additions & 0 deletions b/‎inference_engine/bench/session_long_run.py‎
Lines changed: 210 additions & 0 deletions
diff --git a/‎inference_engine/scheduler/scheduler.py‎
Lines changed: 6 additions & 6 deletions b/‎inference_engine/scheduler/scheduler.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎inference_engine/scheduler/session.py‎
Lines changed: 0 additions & 7 deletions b/‎inference_engine/scheduler/session.py‎
Lines changed: 0 additions & 7 deletions
@@ -94,6 +94,7 @@ jobs:
             tests/inference_engine/scheduler/ \
             tests/inference_engine/pipeline/ \
             tests/inference_engine/session/ \
+            tests/inference_engine/bench/ \
             tests/sdk/python/ \
             tests/training/repr_align/ \
             tests/backends/mlx/test_env.py \
 
@@ -775,12 +775,44 @@ parallelize.
 
 ### 6.4 Phase D — Deprecated HTTP+SSE shim
 
-- **PR-D1**: Update `inference_engine/server/app.py` so each
-  `/v1/chat/completions` request creates a single-shot session under
-  the new `SessionStore`, prefills, generates, and closes. Removes
-  any path-selection / cross-request logic (none of which exists on
-  `main` after C3). Adds `Deprecation` / `Sunset` headers. Updates
-  the existing 461-test integration suite to match.
+*(scope split, recorded 2026-06-01 during implementation of PR-D1.)*
+
+The original PR-D1 entry conflated two coupled changes:
+
+  (a) Remove the ADR 0007 dead code from the server-side surface
+      (path_selection metrics, `_emit_path_selection_metric` helper,
+      `engine_result` field on the scheduler session, etc.).
+  (b) Refactor the HTTP shim's chat-completions handler onto the new
+      `SessionStore` so each request becomes a single-shot session
+      (prefill → generate → close) instead of being driven by the
+      legacy `PooledVerifier`.
+
+(a) is a pure subtraction: the dead code was reachable only from the
+ADR 0007 path_select stack that PR-A3 already removed from the
+verifier side; the server-side metrics and helpers it left behind
+are unreachable at runtime in any healthy completion. (b) is a
+larger refactor of feature-frozen code (per §2.7), with a
+corresponding test-update tail.
+
+The two are split, following the same pattern as PR-A3 / PR-A3b:
+
+- **PR-D1** (this PR, dead-code removal): cleans up §6.6 rows for
+  `app.py` / `engine.py` / `metrics.py` / `scheduler/session.py` /
+  `bench_long_session.py`. The HTTP shim continues to use
+  `PooledVerifier` exactly as before; nothing user-observable
+  changes except the disappearance of the four ADR 0007 metrics
+  from `/metrics` and the `acceptance_rate` field from the OpenAI
+  response (the latter was sourced from `engine_result`, which is
+  gone). 100% Linux unit coverage.
+
+- **PR-D2** (queued, not in PR-D1's diff): the HTTP-shim refactor
+  proper. Each `/v1/chat/completions` request creates a single-shot
+  session under `SessionStore`, prefills, generates, and closes;
+  `PooledVerifier` is retired. Adds `Deprecation` / `Sunset`
+  headers per §2.7. Updates the existing integration suite to
+  match. Linux-only path; §9 carve-out continues to apply. PR-D2
+  is non-blocking for v0.3 GA — the deprecated shim works on
+  `main` post-PR-D1 in its v0.3.0-rc1 shape, just lighter.
 
 ### 6.5 Phase E — Mac M4 integration test marker + CI workflow
 
 
@@ -103,6 +103,25 @@ def __init__(self, config: Optional[VerifierConfig] = None) -> None:
         self.quantization: QuantizationInfo = detect_quantization(self.model)
         self.stats = VerifierStats(weight_bytes=self.quantization.total_weight_bytes)
 
+        # PR-E1c: precompute per-K/V-token byte cost for the
+        # ``kv_live_bytes`` accessor. Mirrors the CPU verifier;
+        # reads dims from the wrapped HF config so GQA / MQA via
+        # ``num_key_value_heads`` is honored.
+        cfg = self.model.config if hasattr(self.model, "config") else self.model
+        num_layers = int(getattr(cfg, "num_hidden_layers"))
+        num_kv_heads = int(
+            getattr(cfg, "num_key_value_heads", None)
+            or getattr(cfg, "num_attention_heads")
+        )
+        head_dim = int(
+            getattr(cfg, "head_dim", None)
+            or (cfg.hidden_size // cfg.num_attention_heads)
+        )
+        itemsize = torch.tensor([], dtype=self.config.dtype).element_size()
+        self._bytes_per_kv_token = (
+            num_layers * num_kv_heads * head_dim * itemsize * 2
+        )
+
     # ---------------------------- public API ---------------------------- #
 
     def reset(self) -> None:
@@ -222,6 +241,17 @@ def k_seq_length(self, session: object) -> int:
         del session  # unused in v0.3 single-tenant scope
         return self._cache_buffer_size()
 
+    def kv_live_bytes(self, session: object) -> int:
+        """Return the live K/V cache size in bytes for ``session``.
+
+        Mirrors the CPU verifier's :meth:`kv_live_bytes`; computed as
+        ``k_seq_length × num_layers × num_kv_heads × head_dim ×
+        itemsize × 2``. PR-E1c — feeds ``GetSessionInfo.kv_live_bytes``
+        through the coordinator's slab-write-through.
+        """
+        del session  # unused in v0.3 single-tenant scope
+        return self._cache_buffer_size() * self._bytes_per_kv_token
+
     # --------------------------- internals --------------------------- #
 
     def _cache_buffer_size(self) -> int:
 
@@ -0,0 +1,7 @@
+"""Pure-Python aggregation helpers used by ``scripts/bench_agentic/``.
+
+These helpers are split out of the CLI scripts so they can be unit-
+tested under the Linux 100% coverage gate. The CLI scripts that
+import them are themselves exempt from the coverage gate (CLI
+plumbing convention; see ``scripts/serve.py`` for precedent).
+"""
@@ -0,0 +1,210 @@
+"""Pure aggregation helpers for the gRPC long-session bench.
+
+The bench script under ``scripts/bench_agentic/bench_session_long_run.py``
+walks one gRPC session through many turns, recording per-turn
+metrics: latency, KV bytes, history length, error / success. After
+the run it calls :func:`aggregate_run` here to compute the headline
+KPIs:
+
+  * ``kv_bounded`` — does ``kv_live_bytes`` stay under a tight band
+    across all turns? (ADR 0006 §2.3.a, ADR 0008 §7 G2.)
+  * ``prefill_bounded`` — does per-turn latency stay flat as the
+    history grows? (ADR 0008 §7 G2 prefill claim, the v0.3 GA gate
+    that was a non-claim on the deprecated HTTP shim.)
+  * Latency p50/p95, KV min/mean/max, n_turns, n_errors.
+
+Splitting this out of the CLI script means the aggregation logic is
+fully unit-testable and the script itself stays focused on IO. The
+report also carries a 10-minute bucket breakdown as a visual sanity
+check on long runs (4h+); that bucketing logic lives here too.
+"""
+
+from __future__ import annotations
+
+import statistics
+from typing import Any, Dict, List, Optional
+
+
+# ---------------------------------------------------------------------------
+# Aggregation
+# ---------------------------------------------------------------------------
+
+
+def _percentile(values: List[float], pct: float) -> Optional[float]:
+    """Linear-interpolated percentile, ``None`` if input is empty.
+
+    Implemented locally instead of pulling in ``numpy`` so the bench
+    has no scientific-stack dependency.
+    """
+    if not values:
+        return None
+    if not 0.0 <= pct <= 1.0:
+        raise ValueError(f"pct must be in [0, 1], got {pct}")
+    sorted_values = sorted(values)
+    if len(sorted_values) == 1:
+        return float(sorted_values[0])
+    rank = pct * (len(sorted_values) - 1)
+    lo = int(rank)
+    hi = min(lo + 1, len(sorted_values) - 1)
+    frac = rank - lo
+    return float(sorted_values[lo] + (sorted_values[hi] - sorted_values[lo]) * frac)
+
+
+def _kv_bounded(kv_values: List[int], *, tolerance: float = 0.10) -> Optional[bool]:
+    """Returns ``True`` iff the KV-bytes series stays within
+    ``tolerance`` (default 10%) of its minimum across every turn.
+
+    Returns ``None`` when there are not enough successful turns to
+    answer (≤1 sample). The tolerance is a relative band — if the
+    minimum is 0 we treat that as a pathologically small denominator
+    and use ``max(min, 1)`` to avoid div-by-zero, the same convention
+    ``bench_long_session.py`` uses.
+    """
+    if len(kv_values) <= 1:
+        return None
+    lo = min(kv_values)
+    hi = max(kv_values)
+    return (hi - lo) / max(lo, 1) < tolerance
+
+
+def _prefill_bounded(
+    latencies: List[float],
+    *,
+    head_window: int = 5,
+    tail_window: int = 5,
+    drift_threshold_s: float = 5.0,
+) -> Optional[bool]:
+    """Returns ``True`` iff median per-turn latency on the LAST
+    ``tail_window`` turns is within ``drift_threshold_s`` seconds of
+    the median on the FIRST ``head_window`` turns.
+
+    This is the prefill-bounded contract: a healthy session-bound
+    runtime processes only the new user message per turn, so latency
+    should not grow with conversation length. On the deprecated HTTP
+    shim, by contrast, every turn re-prefills the full history and
+    latency grows linearly — that's the failure mode this metric
+    catches.
+
+    ``None`` when the run is too short to bracket head and tail
+    windows without overlap.
+    """
+    if len(latencies) < head_window + tail_window:
+        return None
+    head = latencies[:head_window]
+    tail = latencies[-tail_window:]
+    head_p50 = statistics.median(head)
+    tail_p50 = statistics.median(tail)
+    return (tail_p50 - head_p50) <= drift_threshold_s
+
+
+def _latency_drift_p50_s(
+    latencies: List[float],
+    *,
+    head_window: int = 5,
+    tail_window: int = 5,
+) -> Optional[float]:
+    """Drift in seconds between head-window p50 and tail-window p50.
+
+    Positive = latency grew over the run. Returns ``None`` for
+    runs too short to bracket head and tail without overlap.
+    """
+    if len(latencies) < head_window + tail_window:
+        return None
+    head = latencies[:head_window]
+    tail = latencies[-tail_window:]
+    return float(statistics.median(tail) - statistics.median(head))
+
+
+def _bucketize_10min(turns: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Partition successful turns by their wall-clock bucket
+    (10-minute granularity, indexed from 0). Each bucket reports
+    ``n_turns``, p50/p95 latency, and mean kv_live_bytes — gives a
+    visual sanity check of latency / memory drift across a long run.
+
+    Empty input or all-error input returns an empty list.
+    """
+    buckets: Dict[int, List[Dict[str, Any]]] = {}
+    for t in turns:
+        if not t.get("ok"):
+            continue
+        bucket_idx = int(t["t_relative_s"] // 600)
+        buckets.setdefault(bucket_idx, []).append(t)
+
+    out: List[Dict[str, Any]] = []
+    for idx in sorted(buckets):
+        items = buckets[idx]
+        latencies = [float(t["latency_s"]) for t in items]
+        kv_values = [
+            int(t["kv_live_bytes"]) for t in items
+            if t.get("kv_live_bytes") is not None
+        ]
+        out.append(
+            {
+                "bucket_index": idx,
+                "n_turns": len(items),
+                "p50_latency_s": _percentile(latencies, 0.50),
+                "p95_latency_s": _percentile(latencies, 0.95),
+                "mean_kv_live_bytes": (
+                    statistics.mean(kv_values) if kv_values else None
+                ),
+            }
+        )
+    return out
+
+
+def aggregate_run(
+    turns: List[Dict[str, Any]],
+    *,
+    duration_s: float,
+    kv_tolerance: float = 0.10,
+    drift_head_window: int = 5,
+    drift_tail_window: int = 5,
+    drift_threshold_s: float = 5.0,
+) -> Dict[str, Any]:
+    """Build the aggregate report from a list of per-turn records.
+
+    Each turn dict must carry at least:
+      * ``ok`` — bool
+      * ``t_relative_s`` — float, seconds since run start
+      * ``latency_s`` — float (only if ``ok``)
+      * ``kv_live_bytes`` — int or ``None``  (only if ``ok``)
+
+    Returns a dict with the headline KPIs ADR 0006 §2.3.a / ADR 0008
+    §7 G2 speak to: ``kv_bounded``, ``prefill_bounded``, latency
+    p50/p95, kv min/mean/max, error count, 10-minute bucket break-
+    down.
+    """
+    successes = [t for t in turns if t.get("ok")]
+    errors = [t for t in turns if not t.get("ok")]
+
+    latencies = [float(t["latency_s"]) for t in successes]
+    kv_values = [
+        int(t["kv_live_bytes"]) for t in successes
+        if t.get("kv_live_bytes") is not None
+    ]
+
+    return {
+        "n_turns": len(successes),
+        "n_errors": len(errors),
+        "duration_s": float(duration_s),
+        "p50_latency_s": _percentile(latencies, 0.50),
+        "p95_latency_s": _percentile(latencies, 0.95),
+        "min_kv_live_bytes": min(kv_values) if kv_values else None,
+        "mean_kv_live_bytes": (
+            statistics.mean(kv_values) if kv_values else None
+        ),
+        "max_kv_live_bytes": max(kv_values) if kv_values else None,
+        "kv_bounded": _kv_bounded(kv_values, tolerance=kv_tolerance),
+        "prefill_bounded": _prefill_bounded(
+            latencies,
+            head_window=drift_head_window,
+            tail_window=drift_tail_window,
+            drift_threshold_s=drift_threshold_s,
+        ),
+        "latency_drift_p50_s": _latency_drift_p50_s(
+            latencies,
+            head_window=drift_head_window,
+            tail_window=drift_tail_window,
+        ),
+        "buckets_10min": _bucketize_10min(turns),
+    }
@@ -358,12 +358,12 @@ def on_token(tok_id: int) -> bool:
                     session.eos_token_ids, on_token,
                 )
 
-            # Out of engine lock — finalize state.
-            # Stash the engine result on the session so route handlers
-            # can read path-selection observability fields (ADR 0007
-            # §2.10) and acceptance rate. tokens were already streamed
-            # via on_token.
-            session.engine_result = result
+            # Out of engine lock — finalize state. Tokens were already
+            # streamed via on_token; the engine result is otherwise
+            # discarded (PR-D1 of ADR 0008 removed the engine_result
+            # stash that ADR 0007 §2.10 used for path-selection
+            # observability).
+            del result
             if session.state == SessionState.CANCELLED:
                 # Already counted by cancel_session caller; we just
                 # observe the terminal state here.
 
@@ -64,13 +64,6 @@ class Session:
     # the scheduler.iter_tokens() async iterator drain this; the
     # scheduler's worker pushes into it.
     token_queue: asyncio.Queue = field(default_factory=lambda: asyncio.Queue())
-    # The engine's full result, set by the scheduler worker after
-    # ``engine.generate()`` returns. Route handlers read this to
-    # populate ADR 0007 §2.10 path-selection observability metrics
-    # (path_selection, tokens_skipped, prefill_duration_seconds) and
-    # acceptance-rate stats. ``None`` until the engine returns —
-    # callers must check before reading.
-    engine_result: Optional[object] = None
 
     def __post_init__(self) -> None:
         if not self.prompt_ids: