Skip to content

Commit 36e5dab

Browse files
authored
Merge pull request #55 from FluffyAIcode/AgentMemory/v030-pr-n3-remove-http-shim-doubles-8e7f
PR-N3: remove HTTP-shim, engine, tokenizer test doubles
2 parents a621b20 + 055b74b commit 36e5dab

15 files changed

Lines changed: 696 additions & 2316 deletions

.github/workflows/ci.yaml

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,22 @@ jobs:
7272
# PYTHONPATH route avoids a setuptools build step in CI.
7373
PYTHONPATH: .:sdks/python
7474
run: |
75-
# PR-N2 (ADR 0008) cleanup: this gate covers ONLY
75+
# PR-N1/N2/N3 (ADR 0008) cleanup: this gate covers ONLY
7676
# verifier-independent code. The Linux runner cannot load
77-
# real Qwen3 weights; PR-N2 retired the DeterministicEngine
78-
# + DeterministicTokenizer test doubles that previously
79-
# stood in for them. Engine / scheduler runtime tests
80-
# moved to tests/integration/ (Mac M4 / CUDA gate).
77+
# real Qwen3 weights; the cleanup PRs retired the
78+
# FakeVerifier / DeterministicEngine / DeterministicTokenizer
79+
# test doubles. Engine-dependent modules — currently
80+
# ``inference_engine.session.coordinator``,
81+
# ``inference_engine.session.generator``,
82+
# ``inference_engine.scheduler.scheduler``,
83+
# ``inference_engine.server.app``,
84+
# ``inference_engine.server.engine``,
85+
# ``inference_engine.server.tokenizer`` — are exercised by the
86+
# tests/integration/ suite, gated on Mac M4 / CUDA hosts.
8187
#
82-
# Coverage is invoked via ``coverage run -m pytest`` not
83-
# ``pytest --cov=`` to avoid a torch+pytest-cov race at
84-
# conftest-import time on the hosted Linux runner.
88+
# Coverage is invoked via ``coverage run -m pytest`` rather
89+
# than ``pytest --cov=`` to avoid a torch+pytest-cov race
90+
# at conftest-import time.
8591
coverage run -m pytest \
8692
tests/inference_engine/server/ \
8793
tests/inference_engine/memory/ \
@@ -95,10 +101,10 @@ jobs:
95101
--junitxml=junit.xml \
96102
-v
97103
coverage report \
98-
--include='inference_engine/server/*,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/scheduler/pooled_verifier.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/*,training/repr_align/*' \
104+
--include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/scheduler/pooled_verifier.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/*,training/repr_align/*' \
99105
--fail-under=100
100106
coverage xml -o coverage.xml \
101-
--include='inference_engine/server/*,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/scheduler/pooled_verifier.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/*,training/repr_align/*'
107+
--include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/scheduler/pooled_verifier.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/*,training/repr_align/*'
102108
103109
- name: Upload coverage artifact
104110
if: always()

scripts/review_pr_n3_on_mac.sh

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#!/usr/bin/env bash
2+
# Mac M4 review aid for PR-N3 (no-test-doubles cleanup, scope =
3+
# HTTP shim + engine + tokenizer + streaming doubles).
4+
#
5+
# PR-N3 retired the largest cluster of test doubles in the Linux
6+
# tree:
7+
# - DeterministicEngine + DeterministicTokenizer in
8+
# tests/inference_engine/server/conftest.py
9+
# - Engine subtypes (_RaisingEngine, _ProxyEngine,
10+
# _AlwaysHoldingEngine, _KVAwareSlowEngine) in test_app_*.py
11+
# - Tokenizer subtypes (_BrokenTokenizer, _EmptyTemplateTokenizer,
12+
# _NoEosTokenizer) in test_app_*.py
13+
# - Verifier / decoder doubles (_VerifierDouble,
14+
# _LegacyVerifierDouble, _DecoderDouble, _DecoderResult) in
15+
# test_engine.py
16+
#
17+
# All HTTP-shim runtime tests, engine wrapper tests, tokenizer
18+
# wrapper tests, and streaming-detokenizer tests moved to
19+
# tests/integration/ where they run against the real
20+
# ``SpeculativeEngine`` over Qwen3-0.6B.
21+
#
22+
# Produces 2 artifacts (the JSON summary described below, plus the raw *.junit.xml it is derived from):
23+
#
24+
# results/platform-tests/pr-n3-mac-integration-tests-<unix>.json
25+
# pytest -m integration tests/integration/ — runs ALL integration
26+
# suites accumulated to date (PR-E1 INV-3 gate, PR-N1 coordinator
27+
# and generator, PR-N2 scheduler, PR-N3 http_shim + engine +
28+
# tokenizer + streaming).
29+
#
30+
# Usage (from repo root, on Mac M4):
31+
#
32+
# bash scripts/review_pr_n3_on_mac.sh
33+
#
34+
# Then commit:
35+
#
36+
# git add results/platform-tests/pr-n3-mac-*
37+
# git commit -m "Mac M4 review evidence for PR-N3"
38+
# git push
39+
40+
set -euo pipefail

# Always operate from the repository root (the parent of scripts/), so
# the relative paths below resolve no matter where the script is
# invoked from.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"

# A Unix timestamp keeps artifacts from successive runs apart.
stamp="$(date +%s)"
out_dir="results/platform-tests"
mkdir -p "$out_dir"

junit="$out_dir/pr-n3-mac-integration-tests-${stamp}.junit.xml"
report="$out_dir/pr-n3-mac-integration-tests-${stamp}.json"

echo "==> integration suite (all PR-N1/N2/N3 migrated tests + INV-3 GA gate)"
# Capture pytest's exit status instead of letting ``set -e`` abort the
# script: a run with failing tests must still produce the JSON summary,
# otherwise its failure/error counters could never be non-zero.
pytest_status=0
PYTHONPATH=.:sdks/python python3 -m pytest \
    -m integration \
    tests/integration/ \
    --junitxml="$junit" \
    -v || pytest_status=$?

# Without a JUnit file (e.g. pytest aborted on a usage error) there is
# nothing to summarise; surface the failure rather than crash in the
# summary step below.
if [[ ! -s "$junit" ]]; then
    echo "error: pytest (exit ${pytest_status}) did not write ${junit}; no report generated" >&2
    exit $(( pytest_status == 0 ? 1 : pytest_status ))
fi

# Fold the JUnit XML into a small JSON summary: host identification
# plus test/failure/error/skip totals summed over every <testsuite>.
PYTHONPATH=.:sdks/python python3 - "$junit" "$report" <<'PY'
import json
import platform
import sys
import xml.etree.ElementTree as ET

junit_path, out_path = sys.argv[1:3]
jr = ET.parse(junit_path).getroot()
testsuites = list(jr.iter("testsuite"))
total_tests = sum(int(ts.get("tests", "0")) for ts in testsuites)
total_failures = sum(int(ts.get("failures", "0")) for ts in testsuites)
total_errors = sum(int(ts.get("errors", "0")) for ts in testsuites)
total_skipped = sum(int(ts.get("skipped", "0")) for ts in testsuites)
report = {
    "schema_version": 1,
    "kind": "pr_n3_mac_integration_tests",
    "host": {
        "platform": platform.platform(),
        "machine": platform.machine(),
        "python": platform.python_version(),
    },
    "junit": {
        "tests": total_tests, "failures": total_failures,
        "errors": total_errors, "skipped": total_skipped,
    },
}
with open(out_path, "w", encoding="utf-8") as fh:
    json.dump(report, fh, indent=2)
print(f"  -> {out_path}")
PY

# Propagate a failing suite to the caller now that the evidence has
# been written.
if [[ "$pytest_status" -ne 0 ]]; then
    echo >&2
    echo "==> integration suite FAILED (pytest exit ${pytest_status}); summary kept at ${report}" >&2
    exit "$pytest_status"
fi

echo
echo "==> Done. Commit:"
echo "    git add $out_dir/pr-n3-mac-*"
echo "    git commit -m 'Mac M4 review evidence for PR-N3'"
echo "    git push"
Lines changed: 14 additions & 215 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,23 @@
1-
"""Shared test doubles + fixtures for the HTTP server tests.
2-
3-
These are **real concrete classes** that satisfy the
4-
:class:`~inference_engine.server.tokenizer.Tokenizer` and
5-
:class:`~inference_engine.server.engine.Engine` protocols
6-
structurally; they are not ``unittest.mock`` objects, and they do not
7-
patch or wrap any production class. The "deterministic" qualifier
8-
means their outputs are computed from constructor arguments rather
9-
than from a real model, which is what makes route-level tests fast
10-
and reproducible without HF cache.
11-
12-
The same doubles are used by:
13-
14-
* tests/inference_engine/server/test_app_routes.py
15-
* tests/inference_engine/server/test_app_streaming.py
16-
* tests/inference_engine/server/test_streaming.py
17-
18-
Tests of the *real* :class:`SpeculativeEngine` adapter live in
19-
test_engine.py and use the codebase's existing test verifier /
20-
proposer fakes from ``tests/conftest.py`` (also real concrete
21-
classes — see kv_cache_proposer.speculative tests for precedent).
1+
"""Shared fixtures for server-side tests on Linux.
2+
3+
PR-N3 retired the previously-housed ``DeterministicEngine`` and
4+
``DeterministicTokenizer`` test doubles plus their fixtures
5+
(``tokenizer``, ``short_engine``, ``long_engine``). The HTTP shim's
6+
runtime tests moved to ``tests/integration/test_http_shim_real.py``;
7+
the engine + tokenizer wrapper tests moved to
8+
``tests/integration/test_engine_real.py`` and
9+
``tests/integration/test_tokenizer_real.py``.
10+
11+
What stays on Linux: the ``_reset_sse_starlette_app_status``
12+
autouse fixture below, which fixes an sse-starlette / pytest-asyncio
13+
event-loop-binding interaction that would otherwise corrupt async
14+
streaming tests in the (still Linux-runnable) test_streaming.py.
2215
"""
2316

2417
from __future__ import annotations
2518

26-
from typing import Any, Callable, List, Optional
27-
2819
import pytest
2920

30-
from inference_engine.server.engine import EngineResult
31-
3221

3322
# ---------------------------------------------------------------------------
3423
# sse-starlette compatibility shim
@@ -73,193 +62,3 @@ def _reset_sse_starlette_app_status():
7362
finally:
7463
AppStatus.should_exit_event = None
7564
AppStatus.should_exit = False
76-
77-
78-
class DeterministicTokenizer:
79-
"""Tiny deterministic tokenizer that maps words to integer ids.
80-
81-
Vocabulary: each unique word in any input becomes a fresh id.
82-
Two reserved sentinel tokens are predefined so chat-template and
83-
EOS resolution have something to work with:
84-
85-
id 0 -> ``<|im_end|>`` (also reported as eos_token_id)
86-
id 1 -> ``<|unk|>`` (reported as unk_token_id)
87-
88-
``apply_chat_template`` is implemented with a minimal but
89-
deterministic format::
90-
91-
ROLE: <role>
92-
CONTENT: <content>
93-
...
94-
95-
flattened to whitespace-separated words and mapped through the
96-
vocabulary. ``add_generation_prompt=True`` appends the literal
97-
string ``"ASSISTANT:"``. This is sufficient for route-level tests
98-
to exercise full request -> tokenize -> generate -> decode loops
99-
without depending on transformers.
100-
"""
101-
102-
def __init__(self) -> None:
103-
self._token_to_id: dict[str, int] = {"<|im_end|>": 0, "<|unk|>": 1}
104-
self._id_to_token: dict[int, str] = {0: "<|im_end|>", 1: "<|unk|>"}
105-
self.eos_token_id: Optional[int] = 0
106-
self.unk_token_id: Optional[int] = 1
107-
108-
def _intern(self, word: str) -> int:
109-
if word not in self._token_to_id:
110-
new_id = len(self._token_to_id)
111-
self._token_to_id[word] = new_id
112-
self._id_to_token[new_id] = word
113-
return self._token_to_id[word]
114-
115-
def apply_chat_template(
116-
self,
117-
messages: List[dict],
118-
*,
119-
add_generation_prompt: bool,
120-
tokenize: bool,
121-
return_dict: bool,
122-
enable_thinking: bool = False,
123-
) -> Any:
124-
if not tokenize or return_dict:
125-
raise ValueError(
126-
"DeterministicTokenizer only supports tokenize=True, return_dict=False"
127-
)
128-
words: List[str] = []
129-
for msg in messages:
130-
words.append(msg["role"].upper() + ":")
131-
words.extend(msg["content"].split())
132-
if add_generation_prompt:
133-
words.append("ASSISTANT:")
134-
return [self._intern(w) for w in words]
135-
136-
def decode(self, token_ids: List[int], *, skip_special_tokens: bool = False) -> str:
137-
out: List[str] = []
138-
for tid in token_ids:
139-
tok = self._id_to_token.get(int(tid), "<|unk|>")
140-
if skip_special_tokens and tok in {"<|im_end|>", "<|unk|>"}:
141-
continue
142-
out.append(tok)
143-
return " ".join(out)
144-
145-
def convert_tokens_to_ids(self, token: str) -> Optional[int]:
146-
return self._token_to_id.get(token)
147-
148-
149-
class DeterministicEngine:
150-
"""Engine test double that emits a fixed token sequence.
151-
152-
Implements the :class:`~inference_engine.server.engine.Engine`
153-
protocol structurally without subclassing it. The ``generate``
154-
method walks a pre-baked token sequence, invoking ``on_token`` per
155-
committed token and respecting both ``max_new_tokens`` and the
156-
EOS list. The engine therefore exercises every cancellation and
157-
termination branch in the streaming layer without ever loading a
158-
real model.
159-
160-
Special token ids:
161-
* ``0`` is treated as ``<|im_end|>`` by the paired
162-
DeterministicTokenizer; if it appears in ``fixed_tokens`` and
163-
``0 in eos_token_ids`` (the default), generation stops at it.
164-
"""
165-
166-
def __init__(
167-
self,
168-
fixed_tokens: List[int],
169-
tokenizer: DeterministicTokenizer,
170-
model_id_label: str = "kakeya-test",
171-
per_token_delay_s: float = 0.0,
172-
) -> None:
173-
if not fixed_tokens:
174-
raise ValueError("fixed_tokens must be non-empty")
175-
if not model_id_label.strip():
176-
raise ValueError("model_id_label must be non-empty")
177-
if per_token_delay_s < 0:
178-
raise ValueError("per_token_delay_s must be >= 0")
179-
self._fixed_tokens = list(fixed_tokens)
180-
self._tokenizer = tokenizer
181-
self._model_id_label = model_id_label
182-
self._per_token_delay_s = per_token_delay_s
183-
184-
@property
185-
def tokenizer(self) -> DeterministicTokenizer:
186-
return self._tokenizer
187-
188-
@property
189-
def model_id_label(self) -> str:
190-
return self._model_id_label
191-
192-
def kv_state(self) -> int:
193-
"""Test double has no real KV cache — 0 by default. Tests that
194-
want to drive a non-zero gauge value override this."""
195-
return 0
196-
197-
def generate(
198-
self,
199-
prompt_ids: List[int],
200-
max_new_tokens: int,
201-
eos_token_ids: List[int],
202-
on_token: Optional[Callable[[int], bool]] = None,
203-
) -> EngineResult:
204-
if not prompt_ids:
205-
raise ValueError("prompt_ids must be non-empty")
206-
if max_new_tokens <= 0:
207-
raise ValueError(f"max_new_tokens must be positive, got {max_new_tokens}")
208-
if not eos_token_ids:
209-
raise ValueError("eos_token_ids must be non-empty")
210-
eos_set = set(int(i) for i in eos_token_ids)
211-
emitted: List[int] = []
212-
stopped_on_eos = False
213-
for tok in self._fixed_tokens:
214-
if len(emitted) >= max_new_tokens:
215-
break
216-
if self._per_token_delay_s > 0: # pragma: no cover - timing aid
217-
import time
218-
time.sleep(self._per_token_delay_s)
219-
emitted.append(int(tok))
220-
if on_token is not None and on_token(int(tok)):
221-
break
222-
if int(tok) in eos_set:
223-
stopped_on_eos = True
224-
break
225-
return EngineResult(
226-
output_token_ids=emitted,
227-
acceptance_rate=1.0,
228-
proposer_forward_calls=len(emitted),
229-
verifier_forward_calls=len(emitted),
230-
stopped_on_eos=stopped_on_eos,
231-
)
232-
233-
234-
@pytest.fixture
235-
def tokenizer() -> DeterministicTokenizer:
236-
return DeterministicTokenizer()
237-
238-
239-
@pytest.fixture
240-
def short_engine(tokenizer: DeterministicTokenizer) -> DeterministicEngine:
241-
"""Engine that emits 3 tokens then EOS."""
242-
# Pre-intern the words we want the tokens to decode to.
243-
hello = tokenizer._intern("hello")
244-
world = tokenizer._intern("world")
245-
bang = tokenizer._intern("!")
246-
eos = tokenizer.eos_token_id
247-
assert eos is not None
248-
return DeterministicEngine(
249-
fixed_tokens=[hello, world, bang, eos],
250-
tokenizer=tokenizer,
251-
model_id_label="kakeya-test-short",
252-
)
253-
254-
255-
@pytest.fixture
256-
def long_engine(tokenizer: DeterministicTokenizer) -> DeterministicEngine:
257-
"""Engine that emits 50 tokens (no EOS in the sequence) — used to
258-
exercise the ``max_tokens`` truncation path and disconnect-mid-
259-
stream paths."""
260-
ids = [tokenizer._intern(f"tok{i}") for i in range(50)]
261-
return DeterministicEngine(
262-
fixed_tokens=ids,
263-
tokenizer=tokenizer,
264-
model_id_label="kakeya-test-long",
265-
)

0 commit comments

Comments
 (0)