FluffyAIcode
diff --git a/‎inference_engine/backends/mlx/verifier.py‎
Lines changed: 30 additions & 0 deletions b/‎inference_engine/backends/mlx/verifier.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎inference_engine/session/coordinator.py‎
Lines changed: 27 additions & 0 deletions b/‎inference_engine/session/coordinator.py‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎inference_engine/session/generator.py‎
Lines changed: 11 additions & 1 deletion b/‎inference_engine/session/generator.py‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎inference_engine/session/store.py‎
Lines changed: 17 additions & 8 deletions b/‎inference_engine/session/store.py‎
Lines changed: 17 additions & 8 deletions
diff --git a/‎kv_cache_proposer/verifier.py‎
Lines changed: 38 additions & 0 deletions b/‎kv_cache_proposer/verifier.py‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎scripts/review_pr_e1c_on_mac.sh‎
Lines changed: 181 additions & 0 deletions b/‎scripts/review_pr_e1c_on_mac.sh‎
Lines changed: 181 additions & 0 deletions
@@ -103,6 +103,25 @@ def __init__(self, config: Optional[VerifierConfig] = None) -> None:
         self.quantization: QuantizationInfo = detect_quantization(self.model)
         self.stats = VerifierStats(weight_bytes=self.quantization.total_weight_bytes)
 
+        # PR-E1c: precompute per-K/V-token byte cost for the
+        # ``kv_live_bytes`` accessor. Mirrors the CPU verifier;
+        # reads dims from ``model.config`` (or the model itself when
+        # absent) so GQA / MQA ``num_key_value_heads`` is honored.
+        cfg = self.model.config if hasattr(self.model, "config") else self.model
+        num_layers = int(getattr(cfg, "num_hidden_layers"))
+        num_kv_heads = int(
+            getattr(cfg, "num_key_value_heads", None)
+            or getattr(cfg, "num_attention_heads")
+        )
+        head_dim = int(
+            getattr(cfg, "head_dim", None)
+            or (cfg.hidden_size // cfg.num_attention_heads)
+        )
+        itemsize = torch.tensor([], dtype=self.config.dtype).element_size()
+        self._bytes_per_kv_token = (
+            num_layers * num_kv_heads * head_dim * itemsize * 2
+        )
+
     # ---------------------------- public API ---------------------------- #
 
     def reset(self) -> None:
@@ -222,6 +241,17 @@ def k_seq_length(self, session: object) -> int:
         del session  # unused in v0.3 single-tenant scope
         return self._cache_buffer_size()
 
+    def kv_live_bytes(self, session: object) -> int:
+        """Return the live K/V cache size in bytes for ``session``.
+
+        Mirrors the CPU verifier's :meth:`kv_live_bytes`; computed as
+        ``k_seq_length × num_layers × num_kv_heads × head_dim ×
+        itemsize × 2``. PR-E1c — feeds ``GetSessionInfo.kv_live_bytes``
+        through the coordinator's slab-write-through.
+        """
+        del session  # unused in v0.3 single-tenant scope
+        return self._cache_buffer_size() * self._bytes_per_kv_token
+
     # --------------------------- internals --------------------------- #
 
     def _cache_buffer_size(self) -> int:
 
@@ -96,6 +96,28 @@ def commit_or_truncate(self, *, forwarded: int, accepted: int) -> None:
     def k_seq_length(self, session: Session) -> int:
         ...  # pragma: no cover - Protocol body, never executed
 
+    def kv_live_bytes(self, session: Session) -> int:
+        ...  # pragma: no cover - Protocol body, never executed
+
+
+def _sync_slab_bytes(session: Session, verifier: "VerifierProtocol") -> None:
+    """Mirror the verifier's current KV byte count onto the session's
+    slab placeholder (PR-E1c).
+
+    The slab's ``live_kv_bytes`` is the source of truth for
+    :meth:`Session.kv_live_bytes`, which in turn feeds
+    ``GetSessionInfo.kv_live_bytes`` over gRPC. The verifier owns
+    the actual K/V tensors; the slab is a placeholder that holds
+    one capacity unit per active session. Without this sync the
+    gauge reads 0 forever (PR-E1b's 4h bench surfaced this).
+
+    No-op when the session has no slab (pool-less SessionStore — the
+    test / pure-data-layer mode the coordinator unit tests use).
+    """
+    if session.slab is None:
+        return
+    session.slab.live_kv_bytes_override = int(verifier.kv_live_bytes(session))
+
 
 class AppendTokensCoordinator:
     """Orchestrator for the §2.3 byte-exact prefill-incremental contract.
@@ -194,4 +216,9 @@ def append_tokens(
             session_id, self._verifier.next_global_position,
         )
 
+        # Mirror the verifier's current KV byte count onto the slab
+        # so GetSessionInfo.kv_live_bytes reports physical bytes
+        # rather than the slab's placeholder zero. PR-E1c.
+        _sync_slab_bytes(session, self._verifier)
+
         return new_history_length
@@ -51,7 +51,10 @@
 
 import torch
 
-from inference_engine.session.coordinator import VerifierProtocol
+from inference_engine.session.coordinator import (
+    VerifierProtocol,
+    _sync_slab_bytes,
+)
 from inference_engine.session.store import SessionStore
 
 
@@ -226,6 +229,12 @@ def generate(
             yield TokenEvent(token_id=next_token)
 
             if next_token in eos_set:
+                # Mirror final KV bytes onto the slab so the next
+                # GetSessionInfo reads the correct live count
+                # (PR-E1c). Once the cache is at sink+window
+                # capacity, this value plateaus and the caller can
+                # observe the architectural KV bound empirically.
+                _sync_slab_bytes(session, self._verifier)
                 yield DoneEvent(
                     stop_reason=STOP_REASON_EOS,
                     generated_token_count=generated_count,
@@ -234,6 +243,7 @@ def generate(
                 )
                 return
 
+        _sync_slab_bytes(session, self._verifier)
         yield DoneEvent(
             stop_reason=STOP_REASON_MAX_TOKENS,
             generated_token_count=generated_count,
 
@@ -182,14 +182,23 @@ def idle_seconds(self) -> float:
         return time.monotonic() - self.last_active_at
 
     def kv_live_bytes(self) -> int:
-        """Live KV bytes held by this session's slab.
-
-        Returns the slab's reported live KV bytes (the verifier
-        wiring keeps ``slab.live_kv_bytes_override`` synced to its
-        real ``stats.peak_kv_bytes`` snapshot — see
-        ``PooledVerifier._sync_slab_bytes`` for the existing CPU/MLX
-        contract that PR-A3b reuses). When the session has no slab
-        (pool-less store), returns 0.
+        """Live KV bytes held by this session's KV cache.
+
+        Returns the slab's ``live_kv_bytes_override`` field, which is
+        kept in sync with the verifier's true cache size by:
+
+          * The HTTP shim's :class:`PooledVerifier._sync_slab_bytes`
+            (writes ``verifier.stats.peak_kv_bytes`` after every
+            forward — running max).
+          * The gRPC path's coordinator-level ``_sync_slab_bytes``
+            helper (writes current ``verifier.kv_live_bytes(session)``
+            after ``append_tokens`` and at ``generate`` end; PR-E1c).
+
+        The ``Session`` object itself never knows about the verifier;
+        the slab is the single piece of session-bound state that
+        bridges the verifier and the gRPC ``GetSessionInfo`` field.
+
+        Returns 0 when the session has no slab (pool-less store).
         """
         if self.slab is None:
             return 0
 
@@ -98,6 +98,27 @@ def __init__(self, config: Optional[VerifierConfig] = None) -> None:
             weight_bytes=sum(p.numel() * p.element_size() for p in self.model.parameters())
         )
 
+        # PR-E1c: precompute per-K/V-token byte cost so the
+        # ``kv_live_bytes`` accessor is O(1). The single trailing
+        # factor of 2 accounts for storing both K and V. Read the
+        # dims from the HF config so GQA / MQA variants
+        # (Qwen3 / Gemma / DeepSeek) are accounted for correctly via
+        # ``num_key_value_heads`` rather than ``num_attention_heads``.
+        cfg = self.model.config
+        num_layers = int(getattr(cfg, "num_hidden_layers"))
+        num_kv_heads = int(
+            getattr(cfg, "num_key_value_heads", None)
+            or getattr(cfg, "num_attention_heads")
+        )
+        head_dim = int(
+            getattr(cfg, "head_dim", None)
+            or (cfg.hidden_size // cfg.num_attention_heads)
+        )
+        itemsize = torch.tensor([], dtype=self.config.dtype).element_size()
+        self._bytes_per_kv_token = (
+            num_layers * num_kv_heads * head_dim * itemsize * 2
+        )
+
     # ---------------------------- public API ---------------------------- #
     def reset(self) -> None:
         self.cache = DynamicCache(config=self.model.config)
@@ -320,6 +341,23 @@ def k_seq_length(self, session: object) -> int:
         del session  # unused in v0.3 single-tenant scope
         return self._cache_seq_length()
 
+    def kv_live_bytes(self, session: object) -> int:
+        """Return the live K/V cache size in bytes for ``session``.
+
+        Implements the :class:`VerifierProtocol.kv_live_bytes` contract
+        introduced by PR-E1c. Computed as
+        ``k_seq_length × num_layers × num_kv_heads × head_dim ×
+        itemsize × 2`` (the trailing 2 = K + V).
+
+        After PR-E1c the gRPC ``GetSessionInfo.kv_live_bytes`` field is
+        sourced from this method via the coordinator's slab-write-
+        through (PR-E1b's 4h bench surfaced that the previous source —
+        ``slab.live_kv_bytes`` — was always 0 because the slab is a
+        capacity placeholder, not a real KV-tensor sink).
+        """
+        del session  # unused in v0.3 single-tenant scope
+        return self._cache_seq_length() * self._bytes_per_kv_token
+
     def _assert_cache_invariant_1(self) -> None:
         """ADR 0007 §2.9 INV-1: parallel-sequence consistency.
 
 
@@ -0,0 +1,181 @@
+#!/usr/bin/env bash
+# Mac M4 review aid for PR-E1c (kv_live_bytes reporting fix).
+#
+# This PR closes the GetSessionInfo.kv_live_bytes=0 reporting bug
+# PR-E1b's 4-hour bench surfaced. The Linux unit gate exercises the
+# coordinator-level slab-write-through against a deterministic
+# FakeVerifier. The Mac M4 review here adds two further checks:
+#
+#   1. The CPU verifier's kv_live_bytes accessor against real
+#      Qwen3-0.6B numerics — non-zero, plateaus at sink+window
+#      capacity, equals k_seq_length × per-token bytes.
+#   2. A short (5-min) gRPC bench run that confirms
+#      GetSessionInfo.kv_live_bytes is no longer 0 over the wire.
+#
+# Produces 2 JSON artifacts (plus a .junit.xml and a .server.log):
+#
+#   results/platform-tests/pr-e1c-mac-verifier-tests-<unix>.json
+#     pytest tests/core/test_verifier.py + tests/backends/mlx/test_verifier.py
+#     (the kv_live_bytes-related tests + INV-1 baseline).
+#
+#   results/platform-tests/pr-e1c-mac-bench-session-5min-<unix>.json
+#     bench_session_long_run.py @ 300s. Purpose: visually confirm
+#     kv_live_bytes goes 0 -> capped multi-MB once cache hits
+#     sink+window. Expected: kv_bounded=True, prefill_bounded=True,
+#     min/mean/max kv_live_bytes all > 0.
+#
+# Usage (from repo root, on Mac M4):
+#
+#     bash scripts/review_pr_e1c_on_mac.sh
+#
+# Then commit:
+#
+#     git add results/platform-tests/pr-e1c-mac-*
+#     git commit -m "Mac M4 review evidence for PR-E1c"
+#     git push
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+stamp="$(date +%s)"
+out_dir="results/platform-tests"
+mkdir -p "$out_dir"
+
+# --- Part 1: verifier-level tests -----------------------------------------
+verif_junit="$out_dir/pr-e1c-mac-verifier-tests-${stamp}.junit.xml"
+verif_report="$out_dir/pr-e1c-mac-verifier-tests-${stamp}.json"
+
+echo "==> CPU + MLX verifier tests covering kv_live_bytes (PR-E1c)"
+PYTHONPATH=.:sdks/python python3 -m pytest \
+    tests/core/test_verifier.py \
+    tests/backends/mlx/test_verifier.py \
+    -k "kv_live_bytes or k_seq_length or cache_inspector" \
+    --junitxml="$verif_junit" \
+    -v
+
+PYTHONPATH=.:sdks/python python3 - "$verif_junit" "$verif_report" <<'PY'
+import json
+import platform
+import sys
+import xml.etree.ElementTree as ET
+junit_path, out_path = sys.argv[1:3]
+jr = ET.parse(junit_path).getroot()
+testsuites = list(jr.iter("testsuite"))
+total_tests = sum(int(ts.get("tests", "0")) for ts in testsuites)
+total_failures = sum(int(ts.get("failures", "0")) for ts in testsuites)
+total_errors = sum(int(ts.get("errors", "0")) for ts in testsuites)
+total_skipped = sum(int(ts.get("skipped", "0")) for ts in testsuites)
+report = {
+    "schema_version": 1,
+    "kind": "pr_e1c_mac_verifier_tests",
+    "host": {
+        "platform": platform.platform(),
+        "machine": platform.machine(),
+        "python": platform.python_version(),
+    },
+    "junit": {
+        "tests": total_tests, "failures": total_failures,
+        "errors": total_errors, "skipped": total_skipped,
+    },
+}
+with open(out_path, "w", encoding="utf-8") as fh:
+    json.dump(report, fh, indent=2)
+print(f"  -> {out_path}")
+PY
+
+# --- Part 2: 5-min gRPC bench ---------------------------------------------
+# This part requires PR-E1b's scripts/start_grpc_runtime_server.py and
+# scripts/bench_agentic/bench_session_long_run.py to be present on the
+# checked-out tree. PR-E1c merges *after* PR-E1b in the recommended
+# sequence; if PR-E1c is exercised against a tree where PR-E1b hasn't
+# landed yet, skip the bench gracefully so Part 1 evidence still
+# commits cleanly.
+if [[ ! -f scripts/start_grpc_runtime_server.py \
+   || ! -f scripts/bench_agentic/bench_session_long_run.py ]]; then
+    echo
+    echo "==> Part 2 skipped: PR-E1b artifacts not present on this tree."
+    echo "    Re-run after PR-E1b lands to capture the bench evidence."
+    echo
+    echo "==> Done. Commit Part 1 evidence:"
+    echo "    git add $out_dir/pr-e1c-mac-verifier-tests-${stamp}.*"
+    echo "    git commit -m 'Mac M4 review evidence for PR-E1c (verifier tests)'"
+    echo "    git push"
+    exit 0
+fi
+
+bench_json="$out_dir/pr-e1c-mac-bench-session-5min-${stamp}.json"
+server_log="$out_dir/pr-e1c-mac-bench-session-5min-${stamp}.server.log"
+
+server_pid=""
+cleanup() {
+    if [[ -n "$server_pid" ]] && kill -0 "$server_pid" 2>/dev/null; then
+        kill "$server_pid" 2>/dev/null || true
+        wait "$server_pid" 2>/dev/null || true
+    fi
+}
+trap cleanup EXIT
+
+echo
+echo "==> starting gRPC server (logs: $server_log)"
+PYTHONPATH=.:sdks/python python3 scripts/start_grpc_runtime_server.py \
+    --backend cpu --verifier-id Qwen/Qwen3-0.6B \
+    --bind 127.0.0.1:50051 --capacity 1 --sink 4 --window 64 \
+    >"$server_log" 2>&1 &
+server_pid=$!
+
+ready=0
+for _ in $(seq 1 60); do
+    if grep -q "kakeya gRPC RuntimeService listening on" "$server_log" 2>/dev/null; then
+        ready=1
+        break
+    fi
+    sleep 1
+done
+
+if [[ "$ready" != "1" ]]; then
+    echo "!!! gRPC server didn't become ready"
+    tail -20 "$server_log" || true
+    exit 1
+fi
+
+echo "==> running 5-min bench (validates kv_live_bytes is non-zero)"
+PYTHONPATH=.:sdks/python python3 \
+    scripts/bench_agentic/bench_session_long_run.py \
+    --grpc-address 127.0.0.1:50051 \
+    --tokenizer-id Qwen/Qwen3-0.6B \
+    --duration-s 300 --turn-spacing-s 30 \
+    --max-tokens 64 \
+    --output "$bench_json"
+
+echo
+echo "==> Headline KPIs from $bench_json:"
+PYTHONPATH=.:sdks/python python3 - "$bench_json" <<'PY'
+import json
+import sys
+with open(sys.argv[1], encoding="utf-8") as fh:
+    payload = json.load(fh)
+agg = payload["agg"]
+print(f"    n_turns         = {agg['n_turns']}")
+print(f"    n_errors        = {agg['n_errors']}")
+print(f"    p50_latency_s   = {agg['p50_latency_s']}")
+print(f"    kv min/mean/max = "
+      f"{agg['min_kv_live_bytes']} / "
+      f"{agg['mean_kv_live_bytes']} / "
+      f"{agg['max_kv_live_bytes']}")
+print(f"    kv_bounded      = {agg['kv_bounded']}")
+print(f"    prefill_bounded = {agg['prefill_bounded']}")
+m = agg["max_kv_live_bytes"]
+if m and m > 0:
+    print(f"    -> kv_live_bytes is non-zero; PR-E1c reporting fix VERIFIED.")
+else:
+    print(f"    -> kv_live_bytes is still 0; PR-E1c FAILED.")
+    sys.exit(1)
+PY
+
+echo
+echo "==> Done. Commit:"
+echo "    git add $out_dir/pr-e1c-mac-*"
+echo "    git commit -m 'Mac M4 review evidence for PR-E1c'"
+echo "    git push"