|
1 | | -"""Shared test doubles + fixtures for the HTTP server tests. |
2 | | -
|
3 | | -These are **real concrete classes** that satisfy the |
4 | | -:class:`~inference_engine.server.tokenizer.Tokenizer` and |
5 | | -:class:`~inference_engine.server.engine.Engine` protocols |
6 | | -structurally; they are not ``unittest.mock`` objects, and they do not |
7 | | -patch or wrap any production class. The "deterministic" qualifier |
8 | | -means their outputs are computed from constructor arguments rather |
9 | | -than from a real model, which is what makes route-level tests fast |
10 | | -and reproducible without HF cache. |
11 | | -
|
12 | | -The same doubles are used by: |
13 | | -
|
14 | | - * tests/inference_engine/server/test_app_routes.py |
15 | | - * tests/inference_engine/server/test_app_streaming.py |
16 | | - * tests/inference_engine/server/test_streaming.py |
17 | | -
|
18 | | -Tests of the *real* :class:`SpeculativeEngine` adapter live in |
19 | | -test_engine.py and use the codebase's existing test verifier / |
20 | | -proposer fakes from ``tests/conftest.py`` (also real concrete |
21 | | -classes — see kv_cache_proposer.speculative tests for precedent). |
| 1 | +"""Shared fixtures for server-side tests on Linux. |
| 2 | +
|
| 3 | +PR-N3 retired the previously-housed ``DeterministicEngine`` and |
| 4 | +``DeterministicTokenizer`` test doubles plus their fixtures |
| 5 | +(``tokenizer``, ``short_engine``, ``long_engine``). The HTTP shim's |
| 6 | +runtime tests moved to ``tests/integration/test_http_shim_real.py``; |
| 7 | +the engine + tokenizer wrapper tests moved to |
| 8 | +``tests/integration/test_engine_real.py`` and |
| 9 | +``tests/integration/test_tokenizer_real.py``. |
| 10 | +
|
| 11 | +What stays on Linux: the ``_reset_sse_starlette_app_status`` |
| 12 | +autouse fixture below, which fixes an sse-starlette / pytest-asyncio |
| 13 | +event-loop-binding interaction that would otherwise corrupt async |
| 14 | +streaming tests in the (still Linux-runnable) test_streaming.py. |
22 | 15 | """ |
23 | 16 |
|
24 | 17 | from __future__ import annotations |
25 | 18 |
|
26 | | -from typing import Any, Callable, List, Optional |
27 | | - |
28 | 19 | import pytest |
29 | 20 |
|
30 | | -from inference_engine.server.engine import EngineResult |
31 | | - |
32 | 21 |
|
33 | 22 | # --------------------------------------------------------------------------- |
34 | 23 | # sse-starlette compatibility shim |
@@ -73,193 +62,3 @@ def _reset_sse_starlette_app_status(): |
73 | 62 | finally: |
74 | 63 | AppStatus.should_exit_event = None |
75 | 64 | AppStatus.should_exit = False |
76 | | - |
77 | | - |
78 | | -class DeterministicTokenizer: |
79 | | - """Tiny deterministic tokenizer that maps words to integer ids. |
80 | | -
|
81 | | - Vocabulary: each unique word in any input becomes a fresh id. |
82 | | - Two reserved sentinel tokens are predefined so chat-template and |
83 | | - EOS resolution have something to work with: |
84 | | -
|
85 | | - id 0 -> ``<|im_end|>`` (also reported as eos_token_id) |
86 | | - id 1 -> ``<|unk|>`` (reported as unk_token_id) |
87 | | -
|
88 | | - ``apply_chat_template`` is implemented with a minimal but |
89 | | - deterministic format:: |
90 | | -
|
91 | | - ROLE: <role> |
92 | | - CONTENT: <content> |
93 | | - ... |
94 | | -
|
95 | | - flattened to whitespace-separated words and mapped through the |
96 | | - vocabulary. ``add_generation_prompt=True`` appends the literal |
97 | | - string ``"ASSISTANT:"``. This is sufficient for route-level tests |
98 | | - to exercise full request -> tokenize -> generate -> decode loops |
99 | | - without depending on transformers. |
100 | | - """ |
101 | | - |
102 | | - def __init__(self) -> None: |
103 | | - self._token_to_id: dict[str, int] = {"<|im_end|>": 0, "<|unk|>": 1} |
104 | | - self._id_to_token: dict[int, str] = {0: "<|im_end|>", 1: "<|unk|>"} |
105 | | - self.eos_token_id: Optional[int] = 0 |
106 | | - self.unk_token_id: Optional[int] = 1 |
107 | | - |
108 | | - def _intern(self, word: str) -> int: |
109 | | - if word not in self._token_to_id: |
110 | | - new_id = len(self._token_to_id) |
111 | | - self._token_to_id[word] = new_id |
112 | | - self._id_to_token[new_id] = word |
113 | | - return self._token_to_id[word] |
114 | | - |
115 | | - def apply_chat_template( |
116 | | - self, |
117 | | - messages: List[dict], |
118 | | - *, |
119 | | - add_generation_prompt: bool, |
120 | | - tokenize: bool, |
121 | | - return_dict: bool, |
122 | | - enable_thinking: bool = False, |
123 | | - ) -> Any: |
124 | | - if not tokenize or return_dict: |
125 | | - raise ValueError( |
126 | | - "DeterministicTokenizer only supports tokenize=True, return_dict=False" |
127 | | - ) |
128 | | - words: List[str] = [] |
129 | | - for msg in messages: |
130 | | - words.append(msg["role"].upper() + ":") |
131 | | - words.extend(msg["content"].split()) |
132 | | - if add_generation_prompt: |
133 | | - words.append("ASSISTANT:") |
134 | | - return [self._intern(w) for w in words] |
135 | | - |
136 | | - def decode(self, token_ids: List[int], *, skip_special_tokens: bool = False) -> str: |
137 | | - out: List[str] = [] |
138 | | - for tid in token_ids: |
139 | | - tok = self._id_to_token.get(int(tid), "<|unk|>") |
140 | | - if skip_special_tokens and tok in {"<|im_end|>", "<|unk|>"}: |
141 | | - continue |
142 | | - out.append(tok) |
143 | | - return " ".join(out) |
144 | | - |
145 | | - def convert_tokens_to_ids(self, token: str) -> Optional[int]: |
146 | | - return self._token_to_id.get(token) |
147 | | - |
148 | | - |
149 | | -class DeterministicEngine: |
150 | | - """Engine test double that emits a fixed token sequence. |
151 | | -
|
152 | | - Implements the :class:`~inference_engine.server.engine.Engine` |
153 | | - protocol structurally without subclassing it. The ``generate`` |
154 | | - method walks a pre-baked token sequence, invoking ``on_token`` per |
155 | | - committed token and respecting both ``max_new_tokens`` and the |
156 | | - EOS list. The engine therefore exercises every cancellation and |
157 | | - termination branch in the streaming layer without ever loading a |
158 | | - real model. |
159 | | -
|
160 | | - Special token ids: |
161 | | - * ``0`` is treated as ``<|im_end|>`` by the paired |
162 | | - DeterministicTokenizer; if it appears in ``fixed_tokens`` and |
163 | | - ``0 in eos_token_ids`` (the default), generation stops at it. |
164 | | - """ |
165 | | - |
166 | | - def __init__( |
167 | | - self, |
168 | | - fixed_tokens: List[int], |
169 | | - tokenizer: DeterministicTokenizer, |
170 | | - model_id_label: str = "kakeya-test", |
171 | | - per_token_delay_s: float = 0.0, |
172 | | - ) -> None: |
173 | | - if not fixed_tokens: |
174 | | - raise ValueError("fixed_tokens must be non-empty") |
175 | | - if not model_id_label.strip(): |
176 | | - raise ValueError("model_id_label must be non-empty") |
177 | | - if per_token_delay_s < 0: |
178 | | - raise ValueError("per_token_delay_s must be >= 0") |
179 | | - self._fixed_tokens = list(fixed_tokens) |
180 | | - self._tokenizer = tokenizer |
181 | | - self._model_id_label = model_id_label |
182 | | - self._per_token_delay_s = per_token_delay_s |
183 | | - |
184 | | - @property |
185 | | - def tokenizer(self) -> DeterministicTokenizer: |
186 | | - return self._tokenizer |
187 | | - |
188 | | - @property |
189 | | - def model_id_label(self) -> str: |
190 | | - return self._model_id_label |
191 | | - |
192 | | - def kv_state(self) -> int: |
193 | | - """Test double has no real KV cache — 0 by default. Tests that |
194 | | - want to drive a non-zero gauge value override this.""" |
195 | | - return 0 |
196 | | - |
197 | | - def generate( |
198 | | - self, |
199 | | - prompt_ids: List[int], |
200 | | - max_new_tokens: int, |
201 | | - eos_token_ids: List[int], |
202 | | - on_token: Optional[Callable[[int], bool]] = None, |
203 | | - ) -> EngineResult: |
204 | | - if not prompt_ids: |
205 | | - raise ValueError("prompt_ids must be non-empty") |
206 | | - if max_new_tokens <= 0: |
207 | | - raise ValueError(f"max_new_tokens must be positive, got {max_new_tokens}") |
208 | | - if not eos_token_ids: |
209 | | - raise ValueError("eos_token_ids must be non-empty") |
210 | | - eos_set = set(int(i) for i in eos_token_ids) |
211 | | - emitted: List[int] = [] |
212 | | - stopped_on_eos = False |
213 | | - for tok in self._fixed_tokens: |
214 | | - if len(emitted) >= max_new_tokens: |
215 | | - break |
216 | | - if self._per_token_delay_s > 0: # pragma: no cover - timing aid |
217 | | - import time |
218 | | - time.sleep(self._per_token_delay_s) |
219 | | - emitted.append(int(tok)) |
220 | | - if on_token is not None and on_token(int(tok)): |
221 | | - break |
222 | | - if int(tok) in eos_set: |
223 | | - stopped_on_eos = True |
224 | | - break |
225 | | - return EngineResult( |
226 | | - output_token_ids=emitted, |
227 | | - acceptance_rate=1.0, |
228 | | - proposer_forward_calls=len(emitted), |
229 | | - verifier_forward_calls=len(emitted), |
230 | | - stopped_on_eos=stopped_on_eos, |
231 | | - ) |
232 | | - |
233 | | - |
234 | | -@pytest.fixture |
235 | | -def tokenizer() -> DeterministicTokenizer: |
236 | | - return DeterministicTokenizer() |
237 | | - |
238 | | - |
239 | | -@pytest.fixture |
240 | | -def short_engine(tokenizer: DeterministicTokenizer) -> DeterministicEngine: |
241 | | - """Engine that emits 3 tokens then EOS.""" |
242 | | - # Pre-intern the words we want the tokens to decode to. |
243 | | - hello = tokenizer._intern("hello") |
244 | | - world = tokenizer._intern("world") |
245 | | - bang = tokenizer._intern("!") |
246 | | - eos = tokenizer.eos_token_id |
247 | | - assert eos is not None |
248 | | - return DeterministicEngine( |
249 | | - fixed_tokens=[hello, world, bang, eos], |
250 | | - tokenizer=tokenizer, |
251 | | - model_id_label="kakeya-test-short", |
252 | | - ) |
253 | | - |
254 | | - |
255 | | -@pytest.fixture |
256 | | -def long_engine(tokenizer: DeterministicTokenizer) -> DeterministicEngine: |
257 | | - """Engine that emits 50 tokens (no EOS in the sequence) — used to |
258 | | - exercise the ``max_tokens`` truncation path and disconnect-mid- |
259 | | - stream paths.""" |
260 | | - ids = [tokenizer._intern(f"tok{i}") for i in range(50)] |
261 | | - return DeterministicEngine( |
262 | | - fixed_tokens=ids, |
263 | | - tokenizer=tokenizer, |
264 | | - model_id_label="kakeya-test-long", |
265 | | - ) |
0 commit comments