Skip to content

Commit 6e9e9e4

Browse files
authored
Merge pull request #50 from FluffyAIcode/AgentMemory/v030-pr-e1-integration-suite-8e7f
PR-E1 (ADR 0008 §6.5): integration suite + INV-3 byte-exact GA gate
2 parents d12af68 + ab236f9 commit 6e9e9e4

5 files changed

Lines changed: 356 additions & 0 deletions

File tree

pytest.ini

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[pytest]
2+
# Project-wide pytest config. Kept minimal so the existing
3+
# convention of running pytest with explicit path arguments still
4+
# works the same — this file just registers custom markers so they
5+
# don't trigger PytestUnknownMarkWarning.
6+
markers =
7+
integration: integration tests that require real model weights / hardware (Mac M4, per the ADR 0008 §9 GA gate; opted in via `pytest -m integration`).

scripts/review_pr_e1_on_mac.sh

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env bash
2+
# Mac M4 review aid for PR-E1 (ADR 0008 §6.5 integration suite +
3+
# INV-3 GA gate).
4+
#
5+
# This is the first PR whose Mac M4 evidence is **load-bearing for
6+
# v0.3 GA**: the INV-3 gate is GA gate G3 from ADR 0008 §7. Linux
7+
# unit tests cover the dispatch logic with a deterministic
8+
# FakeVerifier; the integration suite covers the same property
9+
# against the real Qwen3-0.6B verifier on the actual sampler
10+
# numerics, which only runs on Apple Silicon (or a CUDA host with
11+
# the right HF cache).
12+
#
13+
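# Produces 2 artifacts under results/platform-tests/ (the JSON summary below, plus the pytest JUnit XML it is derived from, same stem with a .junit.xml suffix):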
14+
#
15+
# pr-e1-mac-integration-tests-<unix>.json
16+
# pytest -m integration tests/integration/ — 3 tests covering
17+
# the INV-3 byte-exact contract under three chunkings + a
18+
# repeated-run determinism check.
19+
#
20+
# Usage (from repo root, on Mac M4 / arm64):
21+
#
22+
# bash scripts/review_pr_e1_on_mac.sh
23+
#
24+
# Then commit the artifact:
25+
#
26+
# git add results/platform-tests/pr-e1-mac-*
27+
# git commit -m "Mac M4 review evidence for PR-E1"
28+
# git push
29+
#
30+
# Unlike review_pr_b3_on_mac.sh, this script does not wrap pytest in
31+
# `coverage run` (no `--include` filter, no `--source` flag, no
32+
# `COVERAGE_CORE=sysmon` env var), so the Python 3.13 /
33+
# coverage / torch race does not apply here.
34+
#
35+
# The integration suite has no module under coverage; we don't
36+
# `--cov`-instrument it. The gate is functional (assert byte-equal
37+
# token streams), not coverage-based.
38+
39+
set -euo pipefail

# Resolve the repo root from this script's own location so the relative
# paths below work no matter which directory the caller is in.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"

# A unix timestamp keeps artifacts from successive runs from colliding.
stamp="$(date +%s)"
out_dir="results/platform-tests"
mkdir -p "$out_dir"

junit="$out_dir/pr-e1-mac-integration-tests-${stamp}.junit.xml"
report="$out_dir/pr-e1-mac-integration-tests-${stamp}.json"

echo "==> integration suite (INV-3 GA gate G3 against real Qwen3)"
# Capture pytest's exit status instead of letting `set -e` abort here:
# a red run is exactly the case where the JSON summary (which records
# failed / errored cases) is most needed as review evidence. The status
# is re-raised at the end of the script.
pytest_rc=0
PYTHONPATH=.:sdks/python python3 -m pytest \
    -m integration \
    tests/integration/ \
    --junitxml="$junit" \
    -v || pytest_rc=$?

# If pytest died before writing the JUnit file there is nothing to
# summarise; fail loudly rather than crash inside the summariser.
if [[ ! -s "$junit" ]]; then
    echo "error: pytest exited with status ${pytest_rc} without writing ${junit}" >&2
    exit "$(( pytest_rc != 0 ? pytest_rc : 1 ))"
fi

# Convert the JUnit XML into the JSON evidence file committed to the PR.
PYTHONPATH=.:sdks/python python3 - "$junit" "$report" <<'PY'
import json
import platform
import sys
import xml.etree.ElementTree as ET

junit_path, out_path = sys.argv[1:3]
jr = ET.parse(junit_path).getroot()

# Same aggregate-from-inner-<testsuite> pattern as the other reviewer
# scripts (commit 9d1a250).
testsuites = list(jr.iter("testsuite"))
total_tests = sum(int(ts.get("tests", "0")) for ts in testsuites)
total_failures = sum(int(ts.get("failures", "0")) for ts in testsuites)
total_errors = sum(int(ts.get("errors", "0")) for ts in testsuites)
total_skipped = sum(int(ts.get("skipped", "0")) for ts in testsuites)

cases = []
for tc in jr.iter("testcase"):
    cases.append({
        "classname": tc.get("classname"),
        "name": tc.get("name"),
        "time": float(tc.get("time", 0.0)),
        # A <testcase> with no failure/error/skipped child passed.
        "outcome": (
            "failed" if tc.find("failure") is not None
            else "errored" if tc.find("error") is not None
            else "skipped" if tc.find("skipped") is not None
            else "passed"
        ),
    })

report = {
    "schema_version": 1,
    "kind": "pr_e1_mac_integration_tests",
    "host": {
        "platform": platform.platform(),
        "machine": platform.machine(),
        "python": platform.python_version(),
    },
    "junit": {
        "tests": total_tests,
        "failures": total_failures,
        "errors": total_errors,
        "skipped": total_skipped,
        "cases": cases,
    },
}
with open(out_path, "w", encoding="utf-8") as fh:
    json.dump(report, fh, indent=2)
print(f"  -> {out_path}")
PY

echo
echo "==> Done."
echo "    Integration tests : $report"
echo "    JUnit             : $junit"
if [[ "$pytest_rc" -ne 0 ]]; then
    echo "    WARNING: pytest exited with status ${pytest_rc}; the artifacts above record a non-green run." >&2
fi
echo
echo "Next:"
echo "  git add $out_dir/pr-e1-mac-*"
echo "  git commit -m 'Mac M4 review evidence for PR-E1'"
echo "  git push"

# Propagate the suite's verdict so callers / CI still see a red run.
exit "$pytest_rc"

tests/integration/__init__.py

Whitespace-only changes.

tests/integration/conftest.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""Shared fixtures and marker plumbing for the integration suite.
2+
3+
Tests under ``tests/integration/`` exercise the v0.3 runtime against
4+
**real** model weights — typically the same Qwen3-0.6B verifier used
5+
by ``tests/core/``. They are NOT part of the Linux unit-test gate
6+
(coverage is platform-neutral; loading real weights is HF-cache- and
7+
hardware-bound), and are NOT auto-discovered by a bare ``pytest``
8+
invocation: every test in this directory carries the
9+
``@pytest.mark.integration`` marker, and you opt in with::
10+
11+
pytest -m integration tests/integration/
12+
13+
Per ADR 0008 §9, this suite is the binding GA gate. PR-E2 (a future
14+
PR) will add a self-hosted Mac M4 GitHub Actions workflow that runs
15+
``pytest -m integration`` on every PR labelled ``needs-mac-m4``;
16+
until that workflow lands, contributors run the suite manually on
17+
Mac M4 and push the resulting JSON / JUnit reports to the PR branch.
18+
"""
19+
20+
from __future__ import annotations
21+
22+
import pytest
23+
24+
25+
def pytest_collection_modifyitems(config, items):
    """Mark and gate every test collected from ``tests/integration/``.

    Each matching item receives ``@pytest.mark.integration`` so
    contributors don't have to repeat the decorator on every test in
    this directory.

    A marker alone does not deselect anything: pytest runs marked
    tests unless a ``-m`` expression filters them out. To make the
    documented opt-in real, matching items additionally receive a
    ``skip`` marker whenever the ``-m`` expression does not mention
    ``integration``; with ``-m integration`` they run normally.

    Args:
        config: the pytest ``Config``; only its ``-m`` expression is read.
        items: the collected test items; matching ones are marked in place.
    """
    # The ``-m`` expression from the command line ("" when not given).
    mark_expression = config.getoption("markexpr", "") or ""
    opted_in = "integration" in mark_expression

    for item in items:
        # str(item.fspath) is reliable across pytest versions; separators
        # are normalised so the check also matches Windows-style paths.
        item_path = str(item.fspath).replace("\\", "/")
        if "tests/integration/" not in item_path:
            continue
        item.add_marker(pytest.mark.integration)
        if not opted_in:
            item.add_marker(
                pytest.mark.skip(
                    reason="integration test; opt in with `pytest -m integration`"
                )
            )
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
"""ADR 0008 §7 GA gate G3 — INV-3 byte-exact determinism.
2+
3+
Drives two independent ``GenerationCoordinator`` instances against
4+
**real** Qwen3-0.6B verifiers through identical history fed via
5+
different chunkings, and asserts the resulting greedy token
6+
streams are byte-identical. This is the integration-level
7+
counterpart of the Linux unit test
8+
``tests/inference_engine/session/test_generator.py::TestDeterminism``,
9+
which uses the deterministic ``FakeVerifier`` to verify the
10+
**dispatch** is non-stateful; this file verifies the same property
11+
holds against the actual verifier numerics on the target hardware.
12+
13+
Replaces the deleted ``tests/core/test_determinism_gate.py`` (PR-A3
14+
removed it together with ``verifier.path_select``; the replacement
15+
landed here, in the integration suite, instead of in
16+
``tests/core/`` because integration is where Mac-M4-only GA gates
17+
belong per ADR 0008 §9).
18+
19+
Marker
20+
------
21+
This whole file inherits ``@pytest.mark.integration`` via
22+
``conftest.py``. Bare ``pytest`` skips it; opt in with::
23+
24+
pytest -m integration tests/integration/test_inv3_session_determinism_gate.py
25+
26+
Fixture cost
27+
------------
28+
``session_verifier_pair`` builds its verifiers inline (not via
29+
``fresh_verifier_factory`` from ``tests/conftest.py``) and loads
30+
Qwen3-0.6B from the HF cache. On Mac M4 with a warm cache the load
31+
is <2 s; cold takes 10-30 s plus download. The module-scoped
32+
fixture shares the loaded verifiers across tests in this file.
32+
"""
33+
34+
from __future__ import annotations
35+
36+
from typing import List
37+
38+
import pytest
39+
import torch
40+
41+
from inference_engine.session import (
42+
AppendTokensCoordinator,
43+
GenerationCoordinator,
44+
SessionStore,
45+
TokenEvent,
46+
)
47+
48+
49+
@pytest.fixture(scope="module")
def session_verifier_pair():
    """Yield two separately constructed ``SinkWindowVerifier`` objects.

    The fixture is module-scoped so the pair is built once and shared
    by every test in this file (loading Qwen3-0.6B twice costs ~2-4 s
    on Mac M4 with a warm HF cache). Tests call ``reset()`` on a
    verifier before driving their own workload, which is what keeps
    one test's state from leaking into the next.

    The verifiers are constructed inline instead of through
    ``fresh_verifier_factory`` from ``tests/conftest.py``: that fixture
    is function-scoped, and pytest does not allow a module-scoped
    fixture to depend on a function-scoped one.
    """
    from kv_cache_proposer.verifier import SinkWindowVerifier, VerifierConfig

    def _new_verifier(sink: int, window: int) -> SinkWindowVerifier:
        # Both verifiers use the same CPU / bfloat16 configuration.
        settings = VerifierConfig(
            dtype=torch.bfloat16,
            device="cpu",
            sink_size=sink,
            window_size=window,
        )
        return SinkWindowVerifier(settings)

    first_verifier = _new_verifier(sink=4, window=64)
    second_verifier = _new_verifier(sink=4, window=64)
    yield first_verifier, second_verifier
80+
81+
82+
def _drive(
    *,
    verifier,
    chunks: List[List[int]],
    max_tokens: int,
) -> List[int]:
    """Run one append-then-generate workload and collect its token ids.

    The verifier is reset first, then a new single-capacity
    ``SessionStore`` and both coordinators are built on top of it.
    Every chunk is appended to a newly created session in the given
    order, after which ``generate`` is consumed and the ``token_id``
    of each ``TokenEvent`` is gathered; other event types are ignored.

    Args:
        verifier: the verifier to reset and drive.
        chunks: the history, split into the pieces to append in order.
        max_tokens: forwarded to ``GenerationCoordinator.generate``.

    Returns:
        The emitted token ids, in emission order.
    """
    verifier.reset()
    session_store = SessionStore(capacity=1, cache_inspector=verifier)
    appender = AppendTokensCoordinator(session_store, verifier)
    generator = GenerationCoordinator(session_store, verifier)

    session = session_store.create_session()
    for piece in chunks:
        appender.append_tokens(session.session_id, piece)

    events = generator.generate(session.session_id, max_tokens=max_tokens)
    return [event.token_id for event in events if isinstance(event, TokenEvent)]
106+
107+
108+
def test_one_call_vs_two_calls_yield_byte_identical_tokens(
    session_verifier_pair,
):
    """The minimal INV-3 gate: same total token sequence delivered
    in 1 call vs. 2 calls produces bit-identical greedy output.

    The one-call run must also be non-empty: two empty streams compare
    equal, which would let the gate pass without having compared
    anything.
    """
    fv_a, fv_b = session_verifier_pair
    full_history = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    tokens_one_call = _drive(
        verifier=fv_a, chunks=[full_history], max_tokens=12,
    )
    tokens_two_calls = _drive(
        verifier=fv_b,
        chunks=[full_history[:5], full_history[5:]],
        max_tokens=12,
    )

    # Guard against a vacuous pass before comparing the streams.
    assert len(tokens_one_call) > 0, (
        "greedy generation emitted no tokens; the INV-3 comparison "
        "would be vacuous"
    )
    assert tokens_one_call == tokens_two_calls, (
        f"INV-3 violated: chunking changed greedy output\n"
        f"  one-call  = {tokens_one_call!r}\n"
        f"  two-calls = {tokens_two_calls!r}"
    )
130+
131+
132+
def test_chunking_invariance_across_three_splits(
    session_verifier_pair,
):
    """Stronger version: three different chunkings all produce the
    same final greedy stream. This catches any chunk-boundary
    numerical drift the 1-vs-2 case might miss (e.g., a bug that
    only triggers when a chunk crosses a sink+window trim
    boundary).

    The verifier's sink+window is (4, 64) = 68 capacity. We pick a
    history short enough to stay under that bound on the first
    pass and long enough to span more than two chunkings.

    The first run must also be non-empty, so that three empty streams
    cannot satisfy the equality check vacuously.
    """
    fv_a, fv_b = session_verifier_pair

    full = [
        100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
        110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
    ]

    chunkings = [
        [full],                                              # 1×20
        [full[:7], full[7:14], full[14:]],                   # 3×medium
        # Step over len(full) rather than a literal 20 so the split
        # stays correct if the history is ever resized.
        [full[i : i + 2] for i in range(0, len(full), 2)],   # 10×small
    ]

    runs = []
    for index, chunks in enumerate(chunkings):
        # Alternate which verifier we use to keep state fully
        # disjoint across chunkings (we have two; the third
        # chunking reuses fv_a after a reset).
        verifier = fv_a if index % 2 == 0 else fv_b
        runs.append(_drive(verifier=verifier, chunks=chunks, max_tokens=8))

    # Guard against a vacuous pass before comparing the streams.
    assert len(runs[0]) > 0, (
        "greedy generation emitted no tokens; the INV-3 comparison "
        "would be vacuous"
    )
    assert runs[0] == runs[1] == runs[2], (
        f"INV-3 violated: chunkings produced divergent token streams\n"
        f"  1×20  = {runs[0]!r}\n"
        f"  3×med = {runs[1]!r}\n"
        f"  10×sm = {runs[2]!r}"
    )
172+
173+
174+
def test_repeated_runs_with_same_history_byte_identical(
    session_verifier_pair,
):
    """Baseline determinism: the identical workload, driven twice
    through the same verifier, must emit the identical token stream.

    This is a sanity check against accidental RNG (greedy decoding
    has no legitimate source of nondeterminism). It also checks that
    the stream is non-empty.
    """
    verifier = session_verifier_pair[0]
    history = [42, 43, 44, 45, 46]

    # Two back-to-back runs of the same single-chunk workload.
    first, second = (
        _drive(verifier=verifier, chunks=[history], max_tokens=6)
        for _ in range(2)
    )

    assert first == second, (
        f"non-determinism in repeated greedy runs:\n"
        f"  first  = {first!r}\n"
        f"  second = {second!r}"
    )
    # Sanity: greedy with a real verifier should produce SOMETHING.
    assert len(first) > 0

0 commit comments

Comments
 (0)