FluffyAIcode
diff --git a/‎.github/workflows/ci.yaml‎
Lines changed: 21 additions & 11 deletions b/‎.github/workflows/ci.yaml‎
Lines changed: 21 additions & 11 deletions
diff --git a/‎scripts/review_pr_n4_on_mac.sh‎
Lines changed: 86 additions & 0 deletions b/‎scripts/review_pr_n4_on_mac.sh‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎tests/integration/__init__.py‎ b/‎tests/integration/__init__.py‎
diff --git a/‎tests/integration/conftest.py‎
Lines changed: 170 additions & 0 deletions b/‎tests/integration/conftest.py‎
Lines changed: 170 additions & 0 deletions
@@ -72,7 +72,22 @@ jobs:
           # PYTHONPATH route avoids a setuptools build step in CI.
           PYTHONPATH: .:sdks/python
         run: |
-          pytest \
+          # PR-N1/N2/N3/N4 (ADR 0008) cleanup: this gate covers ONLY
+          # verifier-independent code. The Linux runner cannot load
+          # real Qwen3 weights; the cleanup PRs retired the
+          # FakeVerifier / DeterministicEngine / DeterministicTokenizer
+          # / _MinimalVerifierStub test doubles. Verifier-dependent
+          # modules — ``inference_engine.session.coordinator``,
+          # ``inference_engine.session.generator``,
+          # ``inference_engine.scheduler.scheduler``,
+          # ``inference_engine.server.{app, engine, tokenizer, streaming}``,
+          # ``kakeya.{client, session}`` — move to the
+          # tests/integration/ suite, gated on Mac M4 / CUDA hosts.
+          #
+          # Coverage is invoked via ``coverage run -m pytest`` rather
+          # than ``pytest --cov=`` to avoid a torch+pytest-cov race
+          # at conftest-import time on the hosted Linux runner.
+          coverage run -m pytest \
             tests/inference_engine/server/ \
             tests/inference_engine/memory/ \
             tests/inference_engine/scheduler/ \
@@ -81,18 +96,13 @@ jobs:
             tests/sdk/python/ \
             tests/training/repr_align/ \
             tests/backends/mlx/test_env.py \
-            --cov=inference_engine.server \
-            --cov=inference_engine.memory \
-            --cov=inference_engine.scheduler \
-            --cov=inference_engine.pipeline \
-            --cov=inference_engine.session \
-            --cov=kakeya \
-            --cov=training.repr_align \
-            --cov-report=term \
-            --cov-report=xml:coverage.xml \
-            --cov-fail-under=100 \
             --junitxml=junit.xml \
             -v
+          # Single source of truth for the gated file list: the terminal
+          # report (which enforces 100%) and the XML artifact must cover
+          # exactly the same files, so the list is spelled out only once.
+          cov_include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/scheduler/pooled_verifier.py,inference_engine/pipeline/*,inference_engine/session/store.py,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*'
+          coverage report --include="$cov_include" --fail-under=100 && coverage xml -o coverage.xml --include="$cov_include"
 
       - name: Upload coverage artifact
         if: always()
 
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# Mac M4 review aid for PR-N4 (no-test-doubles cleanup, FINAL).
+#
+# PR-N4 retires the last verifier-protocol stand-in: the
+# ``_MinimalVerifierStub`` (formerly ``FakeVerifier`` import) in
+# ``tests/sdk/python/conftest.py``. The SDK transport tests
+# (Client + Session) move to ``tests/integration/test_sdk_real.py``
+# where they run against a real Qwen3-0.6B-backed gRPC runtime.
+#
+# After PR-N4: NO test doubles remain in the Linux test tree
+# implementing the verifier / engine / tokenizer protocols. The
+# Linux CI gate covers ONLY truly verifier-independent code; the
+# integration suite is the binding gate for runtime correctness.
+#
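+# Produces 2 artifacts:
+#
+#   results/platform-tests/pr-n4-mac-integration-tests-<unix>.junit.xml
+#     Raw JUnit XML written by ``pytest -m integration
+#     tests/integration/``, which runs the full accumulated
+#     integration suite (PR-E1 INV-3 + PR-N1 coordinator/generator +
+#     PR-N2 scheduler + PR-N3 http_shim/engine/tokenizer/streaming +
+#     PR-N4 SDK).
+#
+#   results/platform-tests/pr-n4-mac-integration-tests-<unix>.json
+#     Summary derived from the JUnit XML: host platform / machine /
+#     Python version plus test, failure, error and skip totals.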
+#
+# Usage (from repo root, on Mac M4):
+#
+#     bash scripts/review_pr_n4_on_mac.sh
+#
+# Then commit:
+#
+#     git add results/platform-tests/pr-n4-mac-*
+#     git commit -m "Mac M4 review evidence for PR-N4"
+#     git push
+
+set -euo pipefail
+
+# Always operate from the repository root, wherever the script is
+# invoked from.
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+# A Unix timestamp in the file names keeps successive runs distinct.
+stamp="$(date +%s)"
+out_dir="results/platform-tests"
+mkdir -p "$out_dir"
+
+junit="$out_dir/pr-n4-mac-integration-tests-${stamp}.junit.xml"
+report="$out_dir/pr-n4-mac-integration-tests-${stamp}.json"
+
+echo "==> integration suite (full accumulated PR-N1..N4 + PR-E1 GA gate)"
+# Record pytest's exit status instead of letting ``set -e`` abort here:
+# a run with failing tests must still produce the JSON summary below
+# (the failure counts are the evidence under review). The status is
+# re-raised at the very end so callers still see the failure.
+pytest_rc=0
+PYTHONPATH=.:sdks/python python3 -m pytest \
+    -m integration \
+    tests/integration/ \
+    --junitxml="$junit" \
+    -v || pytest_rc=$?
+
+# Condense the JUnit XML into the JSON summary: host details plus
+# test / failure / error / skip totals summed over every <testsuite>.
+# If pytest never wrote the XML, this step fails and ``set -e`` stops
+# the script with a non-zero status.
+PYTHONPATH=.:sdks/python python3 - "$junit" "$report" <<'PY'
+import json
+import platform
+import sys
+import xml.etree.ElementTree as ET
+junit_path, out_path = sys.argv[1:3]
+jr = ET.parse(junit_path).getroot()
+testsuites = list(jr.iter("testsuite"))
+total_tests = sum(int(ts.get("tests", "0")) for ts in testsuites)
+total_failures = sum(int(ts.get("failures", "0")) for ts in testsuites)
+total_errors = sum(int(ts.get("errors", "0")) for ts in testsuites)
+total_skipped = sum(int(ts.get("skipped", "0")) for ts in testsuites)
+report = {
+    "schema_version": 1,
+    "kind": "pr_n4_mac_integration_tests",
+    "host": {
+        "platform": platform.platform(),
+        "machine": platform.machine(),
+        "python": platform.python_version(),
+    },
+    "junit": {
+        "tests": total_tests, "failures": total_failures,
+        "errors": total_errors, "skipped": total_skipped,
+    },
+}
+with open(out_path, "w", encoding="utf-8") as fh:
+    json.dump(report, fh, indent=2)
+print(f"  -> {out_path}")
+PY
+
+echo
+if [[ "$pytest_rc" -ne 0 ]]; then
+    echo "==> pytest exited with status $pytest_rc; report was still written." >&2
+fi
+echo "==> Done. Commit:"
+echo "    git add $out_dir/pr-n4-mac-*"
+echo "    git commit -m 'Mac M4 review evidence for PR-N4'"
+echo "    git push"
+
+# Propagate the test outcome to the caller.
+exit "$pytest_rc"
@@ -0,0 +1,170 @@
+"""Shared fixtures and marker plumbing for the integration suite.
+
+Tests under ``tests/integration/`` exercise the v0.3 runtime against
+**real** model weights — typically Qwen3-0.6B from the HF cache.
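+They are NOT part of the Linux unit-test gate (model loading is
+HF-cache- and hardware-bound) and are NOT meant to run under a bare
+``pytest``: every test in this directory gets the
+``@pytest.mark.integration`` marker auto-applied below, and you opt
+in with ``pytest -m integration tests/integration/``.
+NOTE(review): the marker only labels tests; keeping them out of a bare
+``pytest`` run presumably relies on a ``-m "not integration"`` default
+in the project's pytest configuration — confirm.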
+
+This conftest is created independently by PR-E1, PR-N1, PR-N2, PR-N3,
+and PR-N4 (they all branched off main while none had merged yet);
+the file content is the union and de-duplicates cleanly because each
+PR appends its own real-engine / real-runtime fixtures.
+
+Per ADR 0008 §9: this suite is the binding GA gate. Mac M4 reviewer
+scripts (``scripts/review_pr_n*_on_mac.sh``) drive it manually
+until PR-E2 ships the self-hosted runner workflow.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+
+def pytest_collection_modifyitems(config, items):  # noqa: ARG001
+    """Auto-mark every test under ``tests/integration/`` with
+    ``@pytest.mark.integration``.
+
+    Only items whose file lives beneath the directory containing this
+    conftest are marked; anything else in ``items`` is left untouched.
+    The check compares resolved paths instead of searching for the
+    literal text ``tests/integration/``, so it does not depend on the
+    platform's path separator and cannot match an unrelated path that
+    merely contains that text.
+
+    Args:
+        config: The pytest config object (unused).
+        items: The collected test items; marked in place.
+    """
+    from pathlib import Path
+
+    suite_root = Path(__file__).resolve().parent
+    for item in items:
+        test_file = Path(str(item.fspath)).resolve()
+        if suite_root in test_file.parents:
+            item.add_marker(pytest.mark.integration)
+
+
+# ---------------------------------------------------------------------------
+# Real engine fixture — used by PR-N3's HTTP shim integration tests
+# and PR-N4's SDK integration tests.
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def real_speculative_engine():
+    """Session-scoped :class:`SpeculativeEngine` built on real weights.
+
+    Wires a :class:`SparseLogitsProposer` and a
+    :class:`SinkWindowVerifier` (``Qwen/Qwen3-0.6B``) into a
+    :class:`SpeculativeDecoder`, both configured for CPU in
+    ``bfloat16``. The engine reuses the verifier's tokenizer. Heavy
+    imports happen inside the fixture, so they are only paid when a
+    test actually requests it.
+    """
+    import torch
+
+    from inference_engine.proposer import SparseLogitsProposer
+    from inference_engine.server.engine import SpeculativeEngine
+    from kv_cache_proposer.proposer import ProposerConfig
+    from kv_cache_proposer.speculative import SpeculativeDecoder
+    from kv_cache_proposer.verifier import SinkWindowVerifier, VerifierConfig
+
+    # Proposer and verifier share the same precision and placement.
+    placement = {"dtype": torch.bfloat16, "device": "cpu"}
+    proposer_settings = ProposerConfig(**placement)
+    verifier_settings = VerifierConfig(
+        model_id="Qwen/Qwen3-0.6B",
+        sink_size=4,
+        window_size=64,
+        **placement,
+    )
+
+    # Construction order (proposer first, then verifier) is kept as-is.
+    sparse_proposer = SparseLogitsProposer(proposer_settings)
+    window_verifier = SinkWindowVerifier(verifier_settings)
+    spec_decoder = SpeculativeDecoder(
+        proposer=sparse_proposer,
+        verifier=window_verifier,
+        block_size=8,
+        num_diffusion_steps=2,
+    )
+
+    engine = SpeculativeEngine(
+        decoder=spec_decoder,
+        tokenizer=window_verifier.tokenizer,
+        model_id_label="kakeya-integration",
+    )
+    return engine
+
+
+# ---------------------------------------------------------------------------
+# Real gRPC runtime fixture — used by PR-N4's SDK integration tests.
+# An in-process gRPC server backed by a real verifier on a background
+# thread, yielding the host:port string the SDK can connect to.
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def real_grpc_runtime_address():
+    """Run an in-process gRPC ``RuntimeService`` backed by a real
+    Qwen3-0.6B :class:`SinkWindowVerifier` on a background thread.
+
+    Yields the ``127.0.0.1:<port>`` address string the SDK can connect
+    to; the port is whatever the server returned for ``127.0.0.1:0``.
+    Session-scoped, so the verifier is constructed once and shared by
+    every test that uses the fixture. NOTE(review): the original note
+    said the verifier is reset on each ``prefill`` call — that is not
+    visible from this file; confirm against the verifier.
+
+    Raises:
+        RuntimeError: If the server does not signal readiness within
+            15 s, or if the serving coroutine raised before becoming
+            ready (the original exception is chained as the cause).
+    """
+    import asyncio
+    import threading
+    import time
+
+    import grpc
+    import torch
+
+    from inference_engine.server.grpc_app import RuntimeServiceServicer
+    from inference_engine.server.proto_gen.kakeya.v1 import (
+        runtime_pb2_grpc,
+    )
+    from inference_engine.session import (
+        AppendTokensCoordinator,
+        GenerationCoordinator,
+        SessionStore,
+    )
+    from kv_cache_proposer.verifier import SinkWindowVerifier, VerifierConfig
+
+    verifier_cfg = VerifierConfig(
+        model_id="Qwen/Qwen3-0.6B",
+        dtype=torch.bfloat16, device="cpu",
+        sink_size=4, window_size=64,
+    )
+    verifier = SinkWindowVerifier(verifier_cfg)
+    store = SessionStore(capacity=4, cache_inspector=verifier)
+    append_coord = AppendTokensCoordinator(store, verifier)
+    gen_coord = GenerationCoordinator(store, verifier)
+
+    loop = asyncio.new_event_loop()
+    # Mailbox shared between the worker thread and the test thread.
+    holder: dict = {
+        "server": None,
+        "port": None,
+        "error": None,
+        "started": threading.Event(),
+    }
+
+    async def _serve():
+        # Build the server INSIDE the worker thread's loop so any
+        # internal asyncio.Future is bound to this loop, not the
+        # main-thread default loop (the "Future attached to a
+        # different loop" failure PR-B4 hit).
+        server = grpc.aio.server()
+        runtime_pb2_grpc.add_RuntimeServiceServicer_to_server(
+            RuntimeServiceServicer(
+                store,
+                append_coordinator=append_coord,
+                generation_coordinator=gen_coord,
+            ),
+            server,
+        )
+        holder["server"] = server
+        holder["port"] = server.add_insecure_port("127.0.0.1:0")
+        await server.start()
+        holder["started"].set()
+        await server.wait_for_termination()
+
+    def _run():
+        asyncio.set_event_loop(loop)
+        try:
+            loop.run_until_complete(_serve())
+        except Exception as exc:
+            if holder["started"].is_set():
+                # Failure after a successful start: surface it the same
+                # way an uncaught thread exception always was.
+                raise
+            # Startup failure: hand the cause to the waiting test
+            # thread and wake it now instead of letting it time out.
+            holder["error"] = exc
+            holder["started"].set()
+
+    thread = threading.Thread(target=_run, daemon=True)
+    thread.start()
+    if not holder["started"].wait(timeout=15.0):
+        raise RuntimeError(
+            "background gRPC runtime failed to start within 15s",
+        )
+    if holder["error"] is not None:
+        # The worker is on its way out; release the loop if it has
+        # finished, then report the real cause.
+        thread.join(timeout=2.0)
+        if not thread.is_alive():
+            loop.close()
+        raise RuntimeError(
+            "background gRPC runtime failed to start",
+        ) from holder["error"]
+
+    address = f"127.0.0.1:{holder['port']}"
+    try:
+        yield address
+    finally:
+        async def _shutdown():
+            await holder["server"].stop(grace=0.1)
+
+        # Teardown is deliberately best-effort: a failure to stop the
+        # server or close the loop must not fail the test session.
+        try:
+            fut = asyncio.run_coroutine_threadsafe(_shutdown(), loop)
+            fut.result(timeout=2.0)
+        except Exception:  # pragma: no cover - best-effort cleanup
+            pass
+        thread.join(timeout=2.0)
+        time.sleep(0.05)
+        try:
+            loop.close()
+        except Exception:  # pragma: no cover - best-effort cleanup
+            pass