|
1 | 1 | """Shared fixtures and marker plumbing for the integration suite. |
2 | 2 |
|
3 | 3 | Tests under ``tests/integration/`` exercise the v0.3 runtime against |
4 | | -**real** model weights — the same Qwen3-0.6B verifier used by |
5 | | -``tests/core/``. They are NOT part of the Linux unit-test gate |
6 | | -(coverage is platform-neutral; loading real weights is HF-cache- and |
7 | | -hardware-bound), and are NOT auto-discovered by a bare ``pytest`` |
8 | | -invocation: every test in this directory carries the |
9 | | -``@pytest.mark.integration`` marker, and you opt in with:: |
10 | | -
|
11 | | - pytest -m integration tests/integration/ |
12 | | -
|
13 | | -Per ADR 0008 §9, this suite is the binding GA gate. PR-E2 (a future |
14 | | -PR) will add a self-hosted Mac M4 GitHub Actions workflow that runs |
15 | | -``pytest -m integration`` on every PR labelled ``needs-mac-m4``; |
16 | | -until that workflow lands, contributors run the suite manually on |
17 | | -Mac M4 and push the resulting JSON / JUnit reports to the PR branch. |
| 4 | +**real** model weights — typically Qwen3-0.6B from the HF cache. |
| 5 | +They are NOT part of the Linux unit-test gate (model loading is |
| 6 | +HF-cache- and hardware-bound) and are NOT auto-discovered by a bare |
| 7 | +``pytest``: every test in this directory gets the |
| 8 | +``@pytest.mark.integration`` marker auto-applied below, and you opt |
| 9 | +in with ``pytest -m integration tests/integration/``. |
| 10 | +
|
| 11 | +This conftest is created independently by PR-E1, PR-N1, PR-N2, PR-N3, |
| 12 | +and PR-N4 (they all branched off main while none had merged yet); |
| 13 | +the file content is the union and de-duplicates cleanly because each |
| 14 | +PR appends its own real-engine / real-runtime fixtures. |
| 15 | +
|
| 16 | +Per ADR 0008 §9: this suite is the binding GA gate. Mac M4 reviewer |
| 17 | +scripts (``scripts/review_pr_n*_on_mac.sh``) drive it manually |
| 18 | +until PR-E2 ships the self-hosted runner workflow. |
18 | 19 | """ |
19 | 20 |
|
20 | 21 | from __future__ import annotations |
@@ -80,3 +81,106 @@ def real_speculative_engine(): |
80 | 81 | tokenizer=verifier.tokenizer, |
81 | 82 | model_id_label="kakeya-integration", |
82 | 83 | ) |
| 84 | + |
| 85 | + |
| 86 | +# --------------------------------------------------------------------------- |
| 87 | +# Real gRPC runtime fixture — used by PR-N4's SDK integration tests. |
| 88 | +# An in-process gRPC server backed by a real verifier on a background |
| 89 | +# thread, yielding the host:port string the SDK can connect to. |
| 90 | +# --------------------------------------------------------------------------- |
| 91 | + |
| 92 | + |
| 93 | +@pytest.fixture(scope="session") |
| 94 | +def real_grpc_runtime_address(): |
| 95 | + """Run an in-process gRPC ``RuntimeService`` backed by a real |
| 96 | + Qwen3-0.6B :class:`SinkWindowVerifier` on a background thread. |
| 97 | +
|
| 98 | + Yields the ``host:port`` address string the SDK can connect to. |
| 99 | + Session-scoped: model load (~3-5 s on CPU) is paid once. Each |
| 100 | + integration SDK test creates its own session via the SDK; the |
| 101 | + underlying verifier is shared and reset on each ``prefill`` call. |
| 102 | + """ |
| 103 | + import asyncio |
| 104 | + import threading |
| 105 | + import time |
| 106 | + |
| 107 | + import grpc |
| 108 | + import torch |
| 109 | + |
| 110 | + from inference_engine.server.grpc_app import RuntimeServiceServicer |
| 111 | + from inference_engine.server.proto_gen.kakeya.v1 import ( |
| 112 | + runtime_pb2_grpc, |
| 113 | + ) |
| 114 | + from inference_engine.session import ( |
| 115 | + AppendTokensCoordinator, |
| 116 | + GenerationCoordinator, |
| 117 | + SessionStore, |
| 118 | + ) |
| 119 | + from kv_cache_proposer.verifier import SinkWindowVerifier, VerifierConfig |
| 120 | + |
| 121 | + verifier_cfg = VerifierConfig( |
| 122 | + model_id="Qwen/Qwen3-0.6B", |
| 123 | + dtype=torch.bfloat16, device="cpu", |
| 124 | + sink_size=4, window_size=64, |
| 125 | + ) |
| 126 | + verifier = SinkWindowVerifier(verifier_cfg) |
| 127 | + store = SessionStore(capacity=4, cache_inspector=verifier) |
| 128 | + append_coord = AppendTokensCoordinator(store, verifier) |
| 129 | + gen_coord = GenerationCoordinator(store, verifier) |
| 130 | + |
| 131 | + loop = asyncio.new_event_loop() |
| 132 | + holder: dict = { |
| 133 | + "server": None, |
| 134 | + "port": None, |
| 135 | + "started": threading.Event(), |
| 136 | + } |
| 137 | + |
| 138 | + async def _serve(): |
| 139 | + # Build the server INSIDE the worker thread's loop so any |
| 140 | + # internal asyncio.Future is bound to this loop, not the |
| 141 | + # main-thread default loop (the "Future attached to a |
| 142 | + # different loop" failure PR-B4 hit). |
| 143 | + server = grpc.aio.server() |
| 144 | + runtime_pb2_grpc.add_RuntimeServiceServicer_to_server( |
| 145 | + RuntimeServiceServicer( |
| 146 | + store, |
| 147 | + append_coordinator=append_coord, |
| 148 | + generation_coordinator=gen_coord, |
| 149 | + ), |
| 150 | + server, |
| 151 | + ) |
| 152 | + holder["server"] = server |
| 153 | + holder["port"] = server.add_insecure_port("127.0.0.1:0") |
| 154 | + await server.start() |
| 155 | + holder["started"].set() |
| 156 | + await server.wait_for_termination() |
| 157 | + |
| 158 | + def _run(): |
| 159 | + asyncio.set_event_loop(loop) |
| 160 | + loop.run_until_complete(_serve()) |
| 161 | + |
| 162 | + thread = threading.Thread(target=_run, daemon=True) |
| 163 | + thread.start() |
| 164 | + if not holder["started"].wait(timeout=15.0): |
| 165 | + raise RuntimeError( |
| 166 | + "background gRPC runtime failed to start within 15s", |
| 167 | + ) |
| 168 | + |
| 169 | + address = f"127.0.0.1:{holder['port']}" |
| 170 | + try: |
| 171 | + yield address |
| 172 | + finally: |
| 173 | + async def _shutdown(): |
| 174 | + await holder["server"].stop(grace=0.1) |
| 175 | + |
| 176 | + try: |
| 177 | + fut = asyncio.run_coroutine_threadsafe(_shutdown(), loop) |
| 178 | + fut.result(timeout=2.0) |
| 179 | + except Exception: # pragma: no cover - best-effort cleanup |
| 180 | + pass |
| 181 | + thread.join(timeout=2.0) |
| 182 | + time.sleep(0.05) |
| 183 | + try: |
| 184 | + loop.close() |
| 185 | + except Exception: # pragma: no cover - best-effort cleanup |
| 186 | + pass |
0 commit comments