FluffyAIcode
diff --git a/‎.github/workflows/ci.yaml‎
Lines changed: 22 additions & 11 deletions b/‎.github/workflows/ci.yaml‎
Lines changed: 22 additions & 11 deletions
diff --git a/‎scripts/review_pr_n3_on_mac.sh‎
Lines changed: 93 additions & 0 deletions b/‎scripts/review_pr_n3_on_mac.sh‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎tests/inference_engine/server/conftest.py‎
Lines changed: 14 additions & 215 deletions b/‎tests/inference_engine/server/conftest.py‎
Lines changed: 14 additions & 215 deletions
@@ -72,7 +72,23 @@ jobs:
           # PYTHONPATH route avoids a setuptools build step in CI.
           PYTHONPATH: .:sdks/python
         run: |
-          pytest \
+          # PR-N1/N2/N3 (ADR 0008) cleanup: this gate covers ONLY
+          # verifier-independent code. The Linux runner cannot load
+          # real Qwen3 weights; the cleanup PRs retired the
+          # FakeVerifier / DeterministicEngine / DeterministicTokenizer
+          # test doubles. Engine-dependent modules — currently
+          # ``inference_engine.session.coordinator``,
+          # ``inference_engine.session.generator``,
+          # ``inference_engine.scheduler.scheduler``,
+          # ``inference_engine.server.app``,
+          # ``inference_engine.server.engine``,
+          # ``inference_engine.server.tokenizer`` — move to the
+          # tests/integration/ suite, gated on Mac M4 / CUDA hosts.
+          #
+          # Coverage is invoked via ``coverage run -m pytest`` rather
+          # than ``pytest --cov=`` to avoid a torch+pytest-cov race
+          # at conftest-import time.
+          coverage run -m pytest \
             tests/inference_engine/server/ \
             tests/inference_engine/memory/ \
             tests/inference_engine/scheduler/ \
@@ -81,18 +97,13 @@ jobs:
             tests/sdk/python/ \
             tests/training/repr_align/ \
             tests/backends/mlx/test_env.py \
-            --cov=inference_engine.server \
-            --cov=inference_engine.memory \
-            --cov=inference_engine.scheduler \
-            --cov=inference_engine.pipeline \
-            --cov=inference_engine.session \
-            --cov=kakeya \
-            --cov=training.repr_align \
-            --cov-report=term \
-            --cov-report=xml:coverage.xml \
-            --cov-fail-under=100 \
             --junitxml=junit.xml \
             -v
+          coverage report \
+            --include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/scheduler/pooled_verifier.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/*,training/repr_align/*' \
+            --fail-under=100
+          coverage xml -o coverage.xml \
+            --include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/scheduler/pooled_verifier.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/*,training/repr_align/*'
 
       - name: Upload coverage artifact
         if: always()
 
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# Mac M4 review aid for PR-N3 (no-test-doubles cleanup, scope =
+# HTTP shim + engine + tokenizer + streaming doubles).
+#
+# PR-N3 retired the largest cluster of test doubles in the Linux
+# tree:
+#   - DeterministicEngine + DeterministicTokenizer in
+#     tests/inference_engine/server/conftest.py
+#   - Engine subtypes (_RaisingEngine, _ProxyEngine,
+#     _AlwaysHoldingEngine, _KVAwareSlowEngine) in test_app_*.py
+#   - Tokenizer subtypes (_BrokenTokenizer, _EmptyTemplateTokenizer,
+#     _NoEosTokenizer) in test_app_*.py
+#   - Verifier / decoder doubles (_VerifierDouble,
+#     _LegacyVerifierDouble, _DecoderDouble, _DecoderResult) in
+#     test_engine.py
+#
+# All HTTP-shim runtime tests, engine wrapper tests, tokenizer
+# wrapper tests, and streaming-detokenizer tests moved to
+# tests/integration/ where they run against the real
+# ``SpeculativeEngine`` over Qwen3-0.6B.
+#
+# Produces 2 artifacts (raw JUnit XML + the JSON summary derived from it):
+#
+#   results/platform-tests/pr-n3-mac-integration-tests-<unix>.{junit.xml,json}
+#     pytest -m integration tests/integration/ — runs ALL integration
+#     suites accumulated to date (PR-E1 INV-3 gate, PR-N1 coordinator
+#     and generator, PR-N2 scheduler, PR-N3 http_shim + engine +
+#     tokenizer + streaming).
+#
+# Usage (from repo root, on Mac M4):
+#
+#     bash scripts/review_pr_n3_on_mac.sh
+#
+# Then commit:
+#
+#     git add results/platform-tests/pr-n3-mac-*
+#     git commit -m "Mac M4 review evidence for PR-N3"
+#     git push
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+stamp="$(date +%s)"
+out_dir="results/platform-tests"
+mkdir -p "$out_dir"
+
+junit="$out_dir/pr-n3-mac-integration-tests-${stamp}.junit.xml"
+report="$out_dir/pr-n3-mac-integration-tests-${stamp}.json"
+
+echo "==> integration suite (all PR-N1/N2/N3 migrated tests + INV-3 GA gate)"
+PYTHONPATH=.:sdks/python python3 -m pytest \
+    -m integration \
+    tests/integration/ \
+    --junitxml="$junit" \
+    -v
+
+PYTHONPATH=.:sdks/python python3 - "$junit" "$report" <<'PY'
+import json
+import platform
+import sys
+import xml.etree.ElementTree as ET
+junit_path, out_path = sys.argv[1:3]
+jr = ET.parse(junit_path).getroot()
+testsuites = list(jr.iter("testsuite"))
+total_tests = sum(int(ts.get("tests", "0")) for ts in testsuites)
+total_failures = sum(int(ts.get("failures", "0")) for ts in testsuites)
+total_errors = sum(int(ts.get("errors", "0")) for ts in testsuites)
+total_skipped = sum(int(ts.get("skipped", "0")) for ts in testsuites)
+report = {
+    "schema_version": 1,
+    "kind": "pr_n3_mac_integration_tests",
+    "host": {
+        "platform": platform.platform(),
+        "machine": platform.machine(),
+        "python": platform.python_version(),
+    },
+    "junit": {
+        "tests": total_tests, "failures": total_failures,
+        "errors": total_errors, "skipped": total_skipped,
+    },
+}
+with open(out_path, "w", encoding="utf-8") as fh:
+    json.dump(report, fh, indent=2)
+print(f"  -> {out_path}")
+PY
+
+echo
+echo "==> Done. Commit:"
+echo "    git add $out_dir/pr-n3-mac-*"
+echo "    git commit -m 'Mac M4 review evidence for PR-N3'"
+echo "    git push"
@@ -1,34 +1,23 @@
-"""Shared test doubles + fixtures for the HTTP server tests.
-
-These are **real concrete classes** that satisfy the
-:class:`~inference_engine.server.tokenizer.Tokenizer` and
-:class:`~inference_engine.server.engine.Engine` protocols
-structurally; they are not ``unittest.mock`` objects, and they do not
-patch or wrap any production class. The "deterministic" qualifier
-means their outputs are computed from constructor arguments rather
-than from a real model, which is what makes route-level tests fast
-and reproducible without HF cache.
-
-The same doubles are used by:
-
-  * tests/inference_engine/server/test_app_routes.py
-  * tests/inference_engine/server/test_app_streaming.py
-  * tests/inference_engine/server/test_streaming.py
-
-Tests of the *real* :class:`SpeculativeEngine` adapter live in
-test_engine.py and use the codebase's existing test verifier /
-proposer fakes from ``tests/conftest.py`` (also real concrete
-classes — see kv_cache_proposer.speculative tests for precedent).
+"""Shared fixtures for server-side tests on Linux.
+
+PR-N3 retired the previously-housed ``DeterministicEngine`` and
+``DeterministicTokenizer`` test doubles plus their fixtures
+(``tokenizer``, ``short_engine``, ``long_engine``). The HTTP shim's
+runtime tests moved to ``tests/integration/test_http_shim_real.py``;
+the engine + tokenizer wrapper tests moved to
+``tests/integration/test_engine_real.py`` and
+``tests/integration/test_tokenizer_real.py``.
+
+What stays on Linux: the ``_reset_sse_starlette_app_status``
+autouse fixture below, which fixes a sse-starlette / pytest-asyncio
+event-loop-binding interaction that would otherwise corrupt async
+streaming tests in the (still Linux-runnable) test_streaming.py.
 """
 
 from __future__ import annotations
 
-from typing import Any, Callable, List, Optional
-
 import pytest
 
-from inference_engine.server.engine import EngineResult
-
 
 # ---------------------------------------------------------------------------
 # sse-starlette compatibility shim
@@ -73,193 +62,3 @@ def _reset_sse_starlette_app_status():
     finally:
         AppStatus.should_exit_event = None
         AppStatus.should_exit = False
-
-
-class DeterministicTokenizer:
-    """Tiny deterministic tokenizer that maps words to integer ids.
-
-    Vocabulary: each unique word in any input becomes a fresh id.
-    Two reserved sentinel tokens are predefined so chat-template and
-    EOS resolution have something to work with:
-
-        id 0  -> ``<|im_end|>``     (also reported as eos_token_id)
-        id 1  -> ``<|unk|>``        (reported as unk_token_id)
-
-    ``apply_chat_template`` is implemented with a minimal but
-    deterministic format::
-
-        ROLE: <role>
-        CONTENT: <content>
-        ...
-
-    flattened to whitespace-separated words and mapped through the
-    vocabulary. ``add_generation_prompt=True`` appends the literal
-    string ``"ASSISTANT:"``. This is sufficient for route-level tests
-    to exercise full request -> tokenize -> generate -> decode loops
-    without depending on transformers.
-    """
-
-    def __init__(self) -> None:
-        self._token_to_id: dict[str, int] = {"<|im_end|>": 0, "<|unk|>": 1}
-        self._id_to_token: dict[int, str] = {0: "<|im_end|>", 1: "<|unk|>"}
-        self.eos_token_id: Optional[int] = 0
-        self.unk_token_id: Optional[int] = 1
-
-    def _intern(self, word: str) -> int:
-        if word not in self._token_to_id:
-            new_id = len(self._token_to_id)
-            self._token_to_id[word] = new_id
-            self._id_to_token[new_id] = word
-        return self._token_to_id[word]
-
-    def apply_chat_template(
-        self,
-        messages: List[dict],
-        *,
-        add_generation_prompt: bool,
-        tokenize: bool,
-        return_dict: bool,
-        enable_thinking: bool = False,
-    ) -> Any:
-        if not tokenize or return_dict:
-            raise ValueError(
-                "DeterministicTokenizer only supports tokenize=True, return_dict=False"
-            )
-        words: List[str] = []
-        for msg in messages:
-            words.append(msg["role"].upper() + ":")
-            words.extend(msg["content"].split())
-        if add_generation_prompt:
-            words.append("ASSISTANT:")
-        return [self._intern(w) for w in words]
-
-    def decode(self, token_ids: List[int], *, skip_special_tokens: bool = False) -> str:
-        out: List[str] = []
-        for tid in token_ids:
-            tok = self._id_to_token.get(int(tid), "<|unk|>")
-            if skip_special_tokens and tok in {"<|im_end|>", "<|unk|>"}:
-                continue
-            out.append(tok)
-        return " ".join(out)
-
-    def convert_tokens_to_ids(self, token: str) -> Optional[int]:
-        return self._token_to_id.get(token)
-
-
-class DeterministicEngine:
-    """Engine test double that emits a fixed token sequence.
-
-    Implements the :class:`~inference_engine.server.engine.Engine`
-    protocol structurally without subclassing it. The ``generate``
-    method walks a pre-baked token sequence, invoking ``on_token`` per
-    committed token and respecting both ``max_new_tokens`` and the
-    EOS list. The engine therefore exercises every cancellation and
-    termination branch in the streaming layer without ever loading a
-    real model.
-
-    Special token ids:
-      * ``0`` is treated as ``<|im_end|>`` by the paired
-        DeterministicTokenizer; if it appears in ``fixed_tokens`` and
-        ``0 in eos_token_ids`` (the default), generation stops at it.
-    """
-
-    def __init__(
-        self,
-        fixed_tokens: List[int],
-        tokenizer: DeterministicTokenizer,
-        model_id_label: str = "kakeya-test",
-        per_token_delay_s: float = 0.0,
-    ) -> None:
-        if not fixed_tokens:
-            raise ValueError("fixed_tokens must be non-empty")
-        if not model_id_label.strip():
-            raise ValueError("model_id_label must be non-empty")
-        if per_token_delay_s < 0:
-            raise ValueError("per_token_delay_s must be >= 0")
-        self._fixed_tokens = list(fixed_tokens)
-        self._tokenizer = tokenizer
-        self._model_id_label = model_id_label
-        self._per_token_delay_s = per_token_delay_s
-
-    @property
-    def tokenizer(self) -> DeterministicTokenizer:
-        return self._tokenizer
-
-    @property
-    def model_id_label(self) -> str:
-        return self._model_id_label
-
-    def kv_state(self) -> int:
-        """Test double has no real KV cache — 0 by default. Tests that
-        want to drive a non-zero gauge value override this."""
-        return 0
-
-    def generate(
-        self,
-        prompt_ids: List[int],
-        max_new_tokens: int,
-        eos_token_ids: List[int],
-        on_token: Optional[Callable[[int], bool]] = None,
-    ) -> EngineResult:
-        if not prompt_ids:
-            raise ValueError("prompt_ids must be non-empty")
-        if max_new_tokens <= 0:
-            raise ValueError(f"max_new_tokens must be positive, got {max_new_tokens}")
-        if not eos_token_ids:
-            raise ValueError("eos_token_ids must be non-empty")
-        eos_set = set(int(i) for i in eos_token_ids)
-        emitted: List[int] = []
-        stopped_on_eos = False
-        for tok in self._fixed_tokens:
-            if len(emitted) >= max_new_tokens:
-                break
-            if self._per_token_delay_s > 0:  # pragma: no cover - timing aid
-                import time
-                time.sleep(self._per_token_delay_s)
-            emitted.append(int(tok))
-            if on_token is not None and on_token(int(tok)):
-                break
-            if int(tok) in eos_set:
-                stopped_on_eos = True
-                break
-        return EngineResult(
-            output_token_ids=emitted,
-            acceptance_rate=1.0,
-            proposer_forward_calls=len(emitted),
-            verifier_forward_calls=len(emitted),
-            stopped_on_eos=stopped_on_eos,
-        )
-
-
-@pytest.fixture
-def tokenizer() -> DeterministicTokenizer:
-    return DeterministicTokenizer()
-
-
-@pytest.fixture
-def short_engine(tokenizer: DeterministicTokenizer) -> DeterministicEngine:
-    """Engine that emits 3 tokens then EOS."""
-    # Pre-intern the words we want the tokens to decode to.
-    hello = tokenizer._intern("hello")
-    world = tokenizer._intern("world")
-    bang = tokenizer._intern("!")
-    eos = tokenizer.eos_token_id
-    assert eos is not None
-    return DeterministicEngine(
-        fixed_tokens=[hello, world, bang, eos],
-        tokenizer=tokenizer,
-        model_id_label="kakeya-test-short",
-    )
-
-
-@pytest.fixture
-def long_engine(tokenizer: DeterministicTokenizer) -> DeterministicEngine:
-    """Engine that emits 50 tokens (no EOS in the sequence) — used to
-    exercise the ``max_tokens`` truncation path and disconnect-mid-
-    stream paths."""
-    ids = [tokenizer._intern(f"tok{i}") for i in range(50)]
-    return DeterministicEngine(
-        fixed_tokens=ids,
-        tokenizer=tokenizer,
-        model_id_label="kakeya-test-long",
-    )