Merge pull request #31 from FluffyAIcode/AgentMemory/v030-pr7-1-cache-parallel-token-seq-8e7f

FluffyAIcode · web-flow · commit 592015502d24 · 2026-05-31T22:07:55.000+08:00
PR 7-1 (ADR 0007): cached_token_sequence + INV-1 on both verifiers
diff --git a/inference_engine/backends/mlx/verifier.py b/inference_engine/backends/mlx/verifier.py
@@ -91,6 +91,14 @@ def __init__(self, config: Optional[VerifierConfig] = None) -> None:
         self.cache_logical_size: int = 0
         self.next_global_position: int = 0
         self.next_token_logits: Optional[torch.Tensor] = None
+        # Parallel record of the token id at every K/V cache slot, in
+        # the same physical order as ``self.cache[*].keys``. See the CPU
+        # verifier for the full motivation; in short, this is required
+        # by ADR 0007 §2.2 (path-selection needs token-id-level
+        # comparison against the cache) and §2.9 INV-1 (parallel-
+        # sequence consistency). Maintained synchronously with the
+        # K/V tensors by every cache mutation method below.
+        self.cached_token_sequence: List[int] = []
 
         self.quantization: QuantizationInfo = detect_quantization(self.model)
         self.stats = VerifierStats(weight_bytes=self.quantization.total_weight_bytes)
@@ -106,6 +114,8 @@ def reset(self) -> None:
         self.cache_logical_size = 0
         self.next_global_position = 0
         self.next_token_logits = None
+        self.cached_token_sequence = []
+        self._assert_cache_invariant_1()
 
     def prefill(self, prompt_ids: List[int]) -> None:
         if not prompt_ids:
@@ -125,9 +135,16 @@ def prefill(self, prompt_ids: List[int]) -> None:
         self.next_token_logits = mx_to_torch(logits_mx[0, -1])
         self.next_global_position = L
         self.cache_logical_size = self._cache_buffer_size()
+        # Compute the post-trim parallel token sequence directly. The
+        # MLX SinkWindowKVCache trims inside update_and_fetch on every
+        # forward, so by the time we get here the per-layer K/V tensors
+        # already hold the sink+window slice of ``prompt_ids``. We
+        # mirror that slice on cached_token_sequence so INV-1 holds.
+        self.cached_token_sequence = self._sink_window_slice(prompt_ids)
         self._record_peak_kv()
         self.stats.forward_calls += 1
         self.stats.tokens_consumed += L
+        self._assert_cache_invariant_1()
 
     def forward_block(self, tokens: List[int]) -> torch.Tensor:
         if self.cache is None:
@@ -150,9 +167,15 @@ def forward_block(self, tokens: List[int]) -> torch.Tensor:
         # SinkWindowKVCache.update_and_fetch trim. Read directly from
         # the cache rather than tracking it ourselves.
         self.cache_logical_size = self._cache_buffer_size()
+        # Mirror the same trim on the parallel sequence: take the
+        # current sequence concatenated with the new tokens and apply
+        # the sink+window slice.
+        extended = self.cached_token_sequence + list(tokens)
+        self.cached_token_sequence = self._sink_window_slice(extended)
         block_logits = mx_to_torch(logits_mx[0])  # [L, V]
         self.stats.forward_calls += 1
         self.stats.tokens_consumed += L
+        self._assert_cache_invariant_1()
         return block_logits
 
     def commit_or_truncate(self, forwarded: int, accepted: int) -> None:
@@ -169,9 +192,12 @@ def commit_or_truncate(self, forwarded: int, accepted: int) -> None:
                     f"per-layer trim mismatch (asked drop={drop}, got {trims}); "
                     "SinkWindowKVCache state diverged across layers"
                 )
+            # Mirror the tail truncation on the parallel sequence.
+            self.cached_token_sequence = self.cached_token_sequence[:-drop]
         self.cache_logical_size = self._cache_buffer_size()
         self.next_global_position += accepted
         self._record_peak_kv()
+        self._assert_cache_invariant_1()
 
     def append_token(self, token_id: int) -> torch.Tensor:
         logits = self.forward_block([token_id])
@@ -187,6 +213,46 @@ def _cache_buffer_size(self) -> int:
             return 0
         return cache_ops.cache_seq_length(self.cache)
 
+    def _sink_window_slice(self, sequence: List[int]) -> List[int]:
+        """Return ``sequence`` after the sink+window trim that the K/V
+        cache applies.
+
+        Mirrors ``SinkWindowKVCache.update_and_fetch``'s trim logic at
+        the token-id level: if the input length exceeds the budget,
+        keep the first ``sink_size`` entries and the last
+        ``window_size`` entries; otherwise return unchanged.
+        """
+        budget = self.config.sink_size + self.config.window_size
+        if len(sequence) <= budget:
+            return list(sequence)
+        return (
+            list(sequence[: self.config.sink_size])
+            + list(sequence[-self.config.window_size :])
+        )
+
+    def _assert_cache_invariant_1(self) -> None:
+        """ADR 0007 §2.9 INV-1: parallel-sequence consistency.
+
+        After every cache mutation, ``len(self.cached_token_sequence)``
+        must equal the K/V tensor sequence dimension. Violation
+        indicates a bug in the verifier's cache-mutation path; per ADR
+        0007 §2.9 the implementation must raise — never silently
+        recover, never fall back, never re-sync.
+        """
+        actual = len(self.cached_token_sequence)
+        expected = self._cache_buffer_size()
+        if actual != expected:
+            raise AssertionError(
+                f"INV-1 violated (parallel-sequence consistency): "
+                f"cached_token_sequence has {actual} entries but K/V "
+                f"cache seq dim is {expected}. This is a bug in the "
+                f"verifier's cache-mutation path; per ADR 0007 §2.9 it "
+                f"must surface as a critical error rather than be "
+                f"silently recovered. cache_logical_size="
+                f"{self.cache_logical_size}, "
+                f"next_global_position={self.next_global_position}."
+            )
+
     def live_kv_bytes(self) -> int:
         """Return the current size of the verifier's live KV cache in bytes.
 
diff --git a/kv_cache_proposer/verifier.py b/kv_cache_proposer/verifier.py
@@ -76,6 +76,23 @@ def __init__(self, config: Optional[VerifierConfig] = None) -> None:
         # Logits at `next_global_position` predicting the next token. Updated
         # after every forward pass.
         self.next_token_logits: Optional[torch.Tensor] = None
+        # Parallel record of the token id at every K/V cache slot, in the
+        # same physical order as ``self.cache.layers[*].keys``. Maintained
+        # synchronously with the K/V tensors by every cache mutation
+        # method below. Required by ADR 0007 §2.2 + §2.9 INV-1: the
+        # path-selection algorithm (PR 7-2) needs token-id-level prefix
+        # matching against the cache, and the K/V tensors don't expose
+        # token ids.
+        #
+        # Storage: at most ``sink_size + window_size`` int entries, so
+        # bounded at the same constant the K/V cache is bounded at
+        # (e.g. 68 entries × 8 bytes per list slot = 544 bytes, plus
+        # the int objects; negligible vs the 7.4 MiB K/V).
+        #
+        # Invariant INV-1 (ADR 0007 §2.9): after every cache mutation,
+        # ``len(self.cached_token_sequence)`` equals the K/V tensor
+        # sequence dimension. Enforced by ``_assert_cache_invariant_1``.
+        self.cached_token_sequence: List[int] = []
 
         self.stats = VerifierStats(
             weight_bytes=sum(p.numel() * p.element_size() for p in self.model.parameters())
@@ -87,6 +104,8 @@ def reset(self) -> None:
         self.cache_logical_size = 0
         self.next_global_position = 0
         self.next_token_logits = None
+        self.cached_token_sequence = []
+        self._assert_cache_invariant_1()
 
     @torch.no_grad()
     def prefill(self, prompt_ids: List[int]) -> None:
@@ -112,11 +131,19 @@ def prefill(self, prompt_ids: List[int]) -> None:
         self.next_global_position = L
         self.next_token_logits = outputs.logits[0, -1].clone()
 
+        # Update parallel token sequence in lockstep with the K/V cache.
+        # After this prefill the cache holds K/V for all L tokens; the
+        # subsequent ``_trim_cache_in_place`` will drop middle entries
+        # to enforce sink+window. We mirror that exact transformation
+        # on ``cached_token_sequence``.
+        self.cached_token_sequence = list(prompt_ids)
+
         self._record_peak_activation(outputs.logits)
         self._trim_cache_in_place()
         self._record_peak_kv()
         self.stats.forward_calls += 1
         self.stats.tokens_consumed += L
+        self._assert_cache_invariant_1()
 
     @torch.no_grad()
     def forward_block(self, tokens: List[int]) -> torch.Tensor:
@@ -153,9 +180,13 @@ def forward_block(self, tokens: List[int]) -> torch.Tensor:
         self.cache = outputs.past_key_values
         # Cache provisionally has cache_start + L slots until commit/truncate.
         self.cache_logical_size = cache_start + L
+        # Mirror the provisional extension on the parallel sequence;
+        # commit_or_truncate will drop the unaccepted tail in lockstep.
+        self.cached_token_sequence = self.cached_token_sequence + list(tokens)
         self._record_peak_activation(outputs.logits)
         self.stats.forward_calls += 1
         self.stats.tokens_consumed += L
+        self._assert_cache_invariant_1()
         # Don't trim yet — caller decides how many tokens were accepted.
         return outputs.logits[0].clone()  # [L, V]
 
@@ -177,10 +208,13 @@ def commit_or_truncate(
         drop = forwarded - accepted
         if drop > 0:
             self._truncate_tail_in_place(drop)
+            # Mirror the tail truncation on the parallel sequence.
+            self.cached_token_sequence = self.cached_token_sequence[:-drop]
         self.cache_logical_size -= drop
         self.next_global_position += accepted
         self._trim_cache_in_place()
         self._record_peak_kv()
+        self._assert_cache_invariant_1()
 
     @torch.no_grad()
     def append_token(self, token_id: int) -> torch.Tensor:
@@ -226,6 +260,12 @@ def _trim_cache_in_place(self) -> None:
             # peak_kv_bytes would over-report.
             layer.keys = torch.cat([sink_k, tail_k], dim=2).contiguous()
             layer.values = torch.cat([sink_v, tail_v], dim=2).contiguous()
+        # Mirror the same sink+window slice on the parallel token sequence
+        # so cached_token_sequence stays in lockstep with the K/V tensors.
+        self.cached_token_sequence = (
+            self.cached_token_sequence[:sink]
+            + self.cached_token_sequence[-keep_window:]
+        )
         self.cache_logical_size = budget
 
     def _truncate_tail_in_place(self, drop: int) -> None:
@@ -246,6 +286,44 @@ def _truncate_tail_in_place(self, drop: int) -> None:
             layer.keys = keys[:, :, :keep, :].contiguous()
             layer.values = values[:, :, :keep, :].contiguous()
 
+    def _cache_seq_length(self) -> int:
+        """Return the seq dim of the cache K/V tensors, or 0 if empty.
+
+        Reads from the first non-empty layer; all layers share the same
+        seq dim by construction (every K/V mutation in this class
+        applies the same shape transformation across all layers).
+        """
+        if self.cache is None:
+            return 0
+        for layer in self.cache.layers:
+            keys = getattr(layer, "keys", None)
+            if keys is not None:
+                return int(keys.shape[2])
+        return 0
+
+    def _assert_cache_invariant_1(self) -> None:
+        """ADR 0007 §2.9 INV-1: parallel-sequence consistency.
+
+        After every cache mutation, ``len(self.cached_token_sequence)``
+        must equal the K/V tensor sequence dimension. Violation
+        indicates a bug in the cache-mutation path; per ADR 0007 §2.9
+        the implementation must raise — never silently recover, never
+        fall back, never re-sync.
+        """
+        actual = len(self.cached_token_sequence)
+        expected = self._cache_seq_length()
+        if actual != expected:
+            raise AssertionError(
+                f"INV-1 violated (parallel-sequence consistency): "
+                f"cached_token_sequence has {actual} entries but K/V "
+                f"cache seq dim is {expected}. This is a bug in the "
+                f"verifier's cache-mutation path; per ADR 0007 §2.9 it "
+                f"must surface as a critical error rather than be "
+                f"silently recovered. cache_logical_size="
+                f"{self.cache_logical_size}, "
+                f"next_global_position={self.next_global_position}."
+            )
+
     def live_kv_bytes(self) -> int:
         """Return the current size of the verifier's live KV cache in bytes.
 
diff --git a/tests/backends/mlx/test_verifier.py b/tests/backends/mlx/test_verifier.py
@@ -273,6 +273,122 @@ def test_live_kv_bytes_nonzero_after_prefill() -> None:
     assert v.stats.peak_kv_bytes == n
 
 
+# ---------------------------------------------------------------------------
+# ADR 0007 §2.2 + §2.9 — cached_token_sequence + INV-1
+# ---------------------------------------------------------------------------
+
+
+def test_mlx_cached_token_sequence_empty_after_construction() -> None:
+    v = _build_mlx_verifier()
+    assert v.cached_token_sequence == []
+    v._assert_cache_invariant_1()
+
+
+def test_mlx_cached_token_sequence_populated_after_short_prefill() -> None:
+    v = _build_mlx_verifier(sink=2, window=8)
+    prompt = list(range(5))  # 5 < sink+window = 10
+    v.prefill(prompt)
+    assert v.cached_token_sequence == prompt
+    v._assert_cache_invariant_1()
+
+
+def test_mlx_cached_token_sequence_trimmed_after_long_prefill() -> None:
+    v = _build_mlx_verifier(sink=2, window=4)
+    prompt = list(range(20))  # 20 > sink+window = 6
+    v.prefill(prompt)
+    expected = prompt[:2] + prompt[-4:]
+    assert v.cached_token_sequence == expected
+    v._assert_cache_invariant_1()
+
+
+def test_mlx_cached_token_sequence_extends_on_forward_block() -> None:
+    """``forward_block`` extends the cache; the parallel sequence
+    extends in lockstep, then the same sink+window slice that the
+    K/V tensors apply is applied here too."""
+    v = _build_mlx_verifier(sink=2, window=8)
+    v.prefill([0, 1, 2, 3])
+    v.forward_block([4, 5])
+    # 6 entries, all under budget=10
+    assert v.cached_token_sequence == [0, 1, 2, 3, 4, 5]
+    v._assert_cache_invariant_1()
+
+
+def test_mlx_cached_token_sequence_drops_rejected_tail_on_partial_accept() -> None:
+    v = _build_mlx_verifier(sink=2, window=8)
+    v.prefill([0, 1, 2, 3])
+    v.forward_block([4, 5, 6])
+    v.commit_or_truncate(forwarded=3, accepted=1)
+    assert v.cached_token_sequence == [0, 1, 2, 3, 4]
+    v._assert_cache_invariant_1()
+
+
+def test_mlx_cached_token_sequence_after_append_token() -> None:
+    v = _build_mlx_verifier(sink=2, window=8)
+    v.prefill([0, 1, 2, 3])
+    v.append_token(99)
+    assert v.cached_token_sequence == [0, 1, 2, 3, 99]
+    v._assert_cache_invariant_1()
+
+
+def test_mlx_cached_token_sequence_cleared_on_reset() -> None:
+    v = _build_mlx_verifier(sink=2, window=8)
+    v.prefill([0, 1, 2, 3])
+    assert v.cached_token_sequence != []
+    v.reset()
+    assert v.cached_token_sequence == []
+    v._assert_cache_invariant_1()
+
+
+def test_mlx_inv_1_violation_raises_assertion_error() -> None:
+    v = _build_mlx_verifier(sink=2, window=8)
+    v.prefill([0, 1, 2, 3])
+    v.cached_token_sequence = v.cached_token_sequence + [999]
+    with pytest.raises(AssertionError, match="INV-1 violated"):
+        v._assert_cache_invariant_1()
+
+
+def test_mlx_inv_1_assertion_message_carries_diagnostic_state() -> None:
+    """The error message must expose actual vs expected lengths plus
+    the verifier's logical-position counters so a bug report can be
+    triaged from the message alone."""
+    v = _build_mlx_verifier()
+    v.prefill([0, 1, 2, 3])
+    v.cached_token_sequence = v.cached_token_sequence + [42, 43]
+    with pytest.raises(AssertionError) as exc:
+        v._assert_cache_invariant_1()
+    msg = str(exc.value)
+    assert "INV-1" in msg
+    assert "cached_token_sequence" in msg
+    assert "cache_logical_size=" in msg
+    assert "next_global_position=" in msg
+
+
+def test_mlx_inv_1_holds_when_cache_is_none() -> None:
+    """The pre-prefill state (cache None, sequence []) is the trivial
+    INV-1 satisfaction — must not raise."""
+    v = _build_mlx_verifier()
+    assert v.cache is None
+    assert v.cached_token_sequence == []
+    v._assert_cache_invariant_1()
+
+
+def test_mlx_sink_window_slice_below_budget_returns_input_unchanged() -> None:
+    """The internal helper short-circuits when sequence fits in
+    sink+window."""
+    v = _build_mlx_verifier(sink=2, window=4)
+    seq = [10, 20, 30]
+    out = v._sink_window_slice(seq)
+    assert out == seq
+    assert out is not seq  # returns a copy
+
+
+def test_mlx_sink_window_slice_above_budget_keeps_sink_plus_tail() -> None:
+    v = _build_mlx_verifier(sink=2, window=4)
+    seq = list(range(20))
+    out = v._sink_window_slice(seq)
+    assert out == seq[:2] + seq[-4:]
+
+
 def test_record_peak_activation_grows_only() -> None:
     v = _build_mlx_verifier()
     a = mx.zeros((1, 4, 32), dtype=mx.bfloat16)
diff --git a/tests/core/test_verifier.py b/tests/core/test_verifier.py