Merge pull request #50 from FluffyAIcode/AgentMemory/v030-pr-e1-integration-suite-8e7f

FluffyAIcode · web-flow · commit 6e9e9e48e556 · 2026-06-02T12:02:35.000+08:00
PR-E1 (ADR 0008 §6.5): integration suite + INV-3 byte-exact GA gate
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,7 @@
+[pytest]
+# Project-wide pytest config. Kept minimal so the existing
+# convention of running pytest with explicit path arguments still
+# works the same — this file just registers custom markers so they
+# don't trigger PytestUnknownMarkWarning.
+markers =
+    integration: integration tests that require real model weights / hardware (Mac M4; ADR 0008 §9 GA gate; opted in via `pytest -m integration`).
diff --git a/scripts/review_pr_e1_on_mac.sh b/scripts/review_pr_e1_on_mac.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+# Mac M4 review aid for PR-E1 (ADR 0008 §6.5 integration suite +
+# INV-3 GA gate).
+#
+# This is the first PR whose Mac M4 evidence is **load-bearing for
+# v0.3 GA**: the INV-3 gate is GA gate G3 from ADR 0008 §7. Linux
+# unit tests cover the dispatch logic with a deterministic
+# FakeVerifier; the integration suite covers the same property
+# against the real Qwen3-0.6B verifier on the actual sampler
+# numerics, which only runs on Apple Silicon (or a CUDA host with
+# the right HF cache).
+#
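+# Produces 2 artifacts under results/platform-tests/:
+#
+#   pr-e1-mac-integration-tests-<unix>.junit.xml
+#     Raw JUnit XML from pytest -m integration tests/integration/ —
+#     3 tests covering the INV-3 byte-exact contract: 1-call vs.
+#     2-call chunking, three chunkings of one history, and a
+#     repeated-run determinism check.
+#
+#   pr-e1-mac-integration-tests-<unix>.json
+#     Summary of that JUnit file (totals + per-test outcome) plus
+#     the host platform / machine / Python version.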
+#
+# Usage (from repo root, on Mac M4 / arm64):
+#
+#     bash scripts/review_pr_e1_on_mac.sh
+#
+# Then commit the artifact:
+#
+#     git add results/platform-tests/pr-e1-mac-*
+#     git commit -m "Mac M4 review evidence for PR-E1"
+#     git push
+#
+# Unlike review_pr_b3_on_mac.sh, which uses the `coverage run -m
+# pytest` + `--include` filter pattern (no `--source` flag, no
+# `COVERAGE_CORE=sysmon` env var, sidesteps the Python 3.13 /
+# coverage / torch race), this script runs plain `python3 -m pytest`.
+#
+# The integration suite has no module under coverage; we don't
+# `--cov`-instrument it. The gate is functional (assert byte-equal
+# token streams), not coverage-based.
+
+set -euo pipefail
+
+# Always run from the repo root so the relative paths below resolve
+# no matter where the script was invoked from.
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+# Unix timestamp keeps artifacts from successive runs side by side.
+stamp="$(date +%s)"
+out_dir="results/platform-tests"
+mkdir -p "$out_dir"
+
+junit="$out_dir/pr-e1-mac-integration-tests-${stamp}.junit.xml"
+report="$out_dir/pr-e1-mac-integration-tests-${stamp}.json"
+
+echo "==> integration suite (INV-3 GA gate G3 against real Qwen3)"
+# Capture pytest's exit status instead of letting `set -e` abort here:
+# a red run must still be summarised into the JSON report, otherwise
+# the "failed" / "errored" outcomes below could never be recorded.
+pytest_status=0
+PYTHONPATH=.:sdks/python python3 -m pytest \
+    -m integration \
+    tests/integration/ \
+    --junitxml="$junit" \
+    -v || pytest_status=$?
+
+# If pytest died before it could write the JUnit file there is nothing
+# to summarise; fail loudly rather than crash inside the heredoc.
+if [[ ! -f "$junit" ]]; then
+    echo "error: pytest (exit status $pytest_status) did not write $junit" >&2
+    [[ "$pytest_status" -ne 0 ]] || pytest_status=1
+    exit "$pytest_status"
+fi
+
+# Condense the JUnit XML into a small JSON report (totals, per-test
+# outcome, host details). Paths are passed as argv to the heredoc.
+PYTHONPATH=.:sdks/python python3 - "$junit" "$report" <<'PY'
+import json
+import platform
+import sys
+import xml.etree.ElementTree as ET
+
+junit_path, out_path = sys.argv[1:3]
+jr = ET.parse(junit_path).getroot()
+
+# Same aggregate-from-inner-<testsuite> pattern as the other reviewer
+# scripts (commit 9d1a250).
+testsuites = list(jr.iter("testsuite"))
+total_tests = sum(int(ts.get("tests", "0")) for ts in testsuites)
+total_failures = sum(int(ts.get("failures", "0")) for ts in testsuites)
+total_errors = sum(int(ts.get("errors", "0")) for ts in testsuites)
+total_skipped = sum(int(ts.get("skipped", "0")) for ts in testsuites)
+
+cases = []
+for tc in jr.iter("testcase"):
+    cases.append({
+        "classname": tc.get("classname"),
+        "name": tc.get("name"),
+        "time": float(tc.get("time", 0.0)),
+        # First matching child element decides the outcome; a testcase
+        # with none of them passed.
+        "outcome": (
+            "failed" if tc.find("failure") is not None
+            else "errored" if tc.find("error") is not None
+            else "skipped" if tc.find("skipped") is not None
+            else "passed"
+        ),
+    })
+
+report = {
+    "schema_version": 1,
+    "kind": "pr_e1_mac_integration_tests",
+    "host": {
+        "platform": platform.platform(),
+        "machine": platform.machine(),
+        "python": platform.python_version(),
+    },
+    "junit": {
+        "tests": total_tests,
+        "failures": total_failures,
+        "errors": total_errors,
+        "skipped": total_skipped,
+        "cases": cases,
+    },
+}
+with open(out_path, "w", encoding="utf-8") as fh:
+    json.dump(report, fh, indent=2)
+print(f"  -> {out_path}")
+PY
+
+echo
+echo "==> Done."
+echo "    Integration tests : $report"
+echo "    JUnit             : $junit"
+if [[ "$pytest_status" -ne 0 ]]; then
+    echo "    WARNING: pytest exited with status $pytest_status; the gate is NOT green." >&2
+fi
+echo
+echo "Next:"
+echo "    git add $out_dir/pr-e1-mac-*"
+echo "    git commit -m 'Mac M4 review evidence for PR-E1'"
+echo "    git push"
+
+# Propagate pytest's verdict so callers / CI still see a red run.
+exit "$pytest_status"
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -0,0 +1,38 @@
+"""Shared fixtures and marker plumbing for the integration suite.
+
+Tests under ``tests/integration/`` exercise the v0.3 runtime against
+**real** model weights — typically the same Qwen3-0.6B verifier used
+by ``tests/core/``. They are NOT part of the Linux unit-test gate
+(coverage is platform-neutral; loading real weights is HF-cache- and
+hardware-bound), and are NOT auto-discovered by a bare ``pytest``
+invocation: every test in this directory carries the
+``@pytest.mark.integration`` marker, and you opt in with::
+
+    pytest -m integration tests/integration/
+
+Per ADR 0008 §9, this suite is the binding GA gate. PR-E2 (a future
+PR) will add a self-hosted Mac M4 GitHub Actions workflow that runs
+``pytest -m integration`` on every PR labelled ``needs-mac-m4``;
+until that workflow lands, contributors run the suite manually on
+Mac M4 and push the resulting JSON / JUnit reports to the PR branch.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+
+def pytest_collection_modifyitems(config, items):
+    """Mark every test under ``tests/integration/`` as ``integration``
+    and skip it unless the run opted in through ``-m``.
+
+    Adding the marker alone does not keep a test out of a plain
+    ``pytest`` run: markers only filter when a ``-m`` expression is
+    given. To get the documented opt-in behavior, tests in this
+    directory additionally receive a ``skip`` marker whenever the
+    ``-m`` expression does not mention ``integration``.
+
+    Args:
+        config: the pytest config object; only its ``-m`` expression
+            (option ``markexpr``) is read.
+        items: the collected test items; matching items are marked
+            in place.
+    """
+    mark_expr = config.getoption("markexpr", default="") or ""
+    # A substring test is enough: `-m "not integration"` also matches,
+    # but in that case pytest's own -m filtering deselects these tests.
+    opted_in = "integration" in mark_expr
+    skip_unselected = pytest.mark.skip(
+        reason="integration test; opt in with `pytest -m integration`"
+    )
+
+    for item in items:
+        # The hook receives the whole collection, so filter by path.
+        # Normalise separators so the check also holds for
+        # backslash-separated paths.
+        item_path = str(item.fspath).replace("\\", "/")
+        if "tests/integration/" not in item_path:
+            continue
+        item.add_marker(pytest.mark.integration)
+        if not opted_in:
+            item.add_marker(skip_unselected)
diff --git a/tests/integration/test_inv3_session_determinism_gate.py b/tests/integration/test_inv3_session_determinism_gate.py
@@ -0,0 +1,193 @@
+"""ADR 0008 §7 GA gate G3 — INV-3 byte-exact determinism.
+
+Drives two independent ``GenerationCoordinator`` instances against
+**real** Qwen3-0.6B verifiers through identical history fed via
+different chunkings, and asserts the resulting greedy token
+streams are byte-identical. This is the integration-level
+counterpart of the Linux unit test
+``tests/inference_engine/session/test_generator.py::TestDeterminism``,
+which uses the deterministic ``FakeVerifier`` to verify the
+**dispatch** is non-stateful; this file verifies the same property
+holds against the actual verifier numerics on the target hardware.
+
+Replaces the deleted ``tests/core/test_determinism_gate.py`` (PR-A3
+removed it together with ``verifier.path_select``; the replacement
+landed here, in the integration suite, instead of in
+``tests/core/`` because integration is where Mac-M4-only GA gates
+belong per ADR 0008 §9).
+
+Marker
+------
+This whole file inherits ``@pytest.mark.integration`` via
+``conftest.py``. Bare ``pytest`` skips it; opt in with::
+
+    pytest -m integration tests/integration/test_inv3_session_determinism_gate.py
+
+Fixture cost
+------------
+``session_verifier_pair`` builds two Qwen3-0.6B verifiers inline
+(the function-scoped ``fresh_verifier_factory`` from
+``tests/conftest.py`` cannot back a module-scoped fixture). On Mac
+M4 with a warm HF cache each load is <2 s; cold takes 10-30 s plus
+download. The pair is built once per module and shared across the
+tests in this file.
+"""
+
+from __future__ import annotations
+
+from typing import List
+
+import pytest
+import torch
+
+from inference_engine.session import (
+    AppendTokensCoordinator,
+    GenerationCoordinator,
+    SessionStore,
+    TokenEvent,
+)
+
+
+@pytest.fixture(scope="module")
+def session_verifier_pair():
+    """Build two independent verifiers shared by every test in this module.
+
+    Returns a ``(fv_a, fv_b)`` tuple of ``SinkWindowVerifier``
+    instances with identical configuration (bfloat16, CPU, sink=4,
+    window=64). Stores and coordinators are NOT part of the fixture;
+    ``_drive`` creates fresh ones per run.
+
+    Module-scoped: loading Qwen3-0.6B twice costs ~2-4 s on Mac M4
+    with a warm HF cache. Tests share the pair; ``_drive`` calls
+    ``reset()`` on the verifier before each workload so state from
+    one test does not leak into the next.
+
+    The verifier is built inline (rather than through
+    ``fresh_verifier_factory``, which is function-scoped in
+    ``tests/conftest.py``) because pytest forbids a module-scoped
+    fixture depending on a function-scoped one.
+    """
+    # Imported lazily so this module can be imported without pulling
+    # in the verifier package; ``torch`` comes from the module-level
+    # import.
+    from kv_cache_proposer.verifier import SinkWindowVerifier, VerifierConfig
+
+    def _build(sink: int, window: int) -> SinkWindowVerifier:
+        """Construct one verifier with the given sink / window sizes."""
+        return SinkWindowVerifier(
+            VerifierConfig(
+                dtype=torch.bfloat16,
+                device="cpu",
+                sink_size=sink,
+                window_size=window,
+            )
+        )
+
+    fv_a = _build(sink=4, window=64)
+    fv_b = _build(sink=4, window=64)
+    # No teardown is needed, so a plain return replaces the yield.
+    return fv_a, fv_b
+
+
+def _drive(
+    *,
+    verifier,
+    chunks: List[List[int]],
+    max_tokens: int,
+) -> List[int]:
+    """Replay ``chunks`` into a brand-new session and greedy-generate.
+
+    Resets ``verifier``, wires a fresh ``SessionStore`` (capacity 1)
+    plus append / generation coordinators onto it, appends each
+    chunk in order, and then collects the ``token_id`` of every
+    ``TokenEvent`` produced by ``generate``. Other event types are
+    ignored.
+
+    Args:
+        verifier: verifier to drive; also used as the store's
+            ``cache_inspector``.
+        chunks: history token ids, split into the append calls to make.
+        max_tokens: forwarded to ``GenerationCoordinator.generate``.
+
+    Returns:
+        The emitted token ids, in generation order.
+    """
+    verifier.reset()
+    store = SessionStore(capacity=1, cache_inspector=verifier)
+    appender = AppendTokensCoordinator(store, verifier)
+    generator = GenerationCoordinator(store, verifier)
+
+    session_id = store.create_session().session_id
+    for piece in chunks:
+        appender.append_tokens(session_id, piece)
+
+    events = generator.generate(session_id, max_tokens=max_tokens)
+    return [ev.token_id for ev in events if isinstance(ev, TokenEvent)]
+
+
+def test_one_call_vs_two_calls_yield_byte_identical_tokens(
+    session_verifier_pair,
+):
+    """The minimal INV-3 gate: the same total token sequence delivered
+    in 1 call vs. 2 calls produces identical greedy output.
+
+    Each delivery runs on its own verifier from the pair. The test
+    also requires that generation emitted at least one token, so two
+    empty streams cannot satisfy the equality check vacuously.
+    """
+    fv_a, fv_b = session_verifier_pair
+    full_history = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+    split_at = len(full_history) // 2
+
+    tokens_one_call = _drive(
+        verifier=fv_a, chunks=[full_history], max_tokens=12,
+    )
+    tokens_two_calls = _drive(
+        verifier=fv_b,
+        chunks=[full_history[:split_at], full_history[split_at:]],
+        max_tokens=12,
+    )
+
+    # Same sanity check the repeated-run test applies: an empty stream
+    # would make the comparison below meaningless.
+    assert tokens_one_call, (
+        "INV-3 gate is vacuous: greedy generation emitted no tokens"
+    )
+    assert tokens_one_call == tokens_two_calls, (
+        f"INV-3 violated: chunking changed greedy output\n"
+        f"  one-call    = {tokens_one_call!r}\n"
+        f"  two-calls   = {tokens_two_calls!r}"
+    )
+
+
+def test_chunking_invariance_across_three_splits(
+    session_verifier_pair,
+):
+    """Stronger version: three different chunkings of one history all
+    produce the same greedy stream.
+
+    Placing chunk boundaries at several different offsets catches
+    boundary-dependent numerical drift the 1-vs-2 case might miss.
+
+    The verifier's sink+window is (4, 64) = 68 capacity. The
+    20-token history plus 8 generated tokens stays under that bound,
+    so this test does not exercise sink+window trimming.
+    """
+    fv_a, fv_b = session_verifier_pair
+
+    full = list(range(100, 120))
+
+    chunkings = [
+        [full],                                              # 1×20
+        [full[:7], full[7:14], full[14:]],                   # 3×medium
+        [full[i : i + 2] for i in range(0, len(full), 2)],   # 10×small
+    ]
+
+    # Alternate between the two verifiers so consecutive chunkings run
+    # on different instances; the third chunking reuses fv_a, which
+    # _drive resets before use.
+    verifiers = (fv_a, fv_b)
+    runs = [
+        _drive(
+            verifier=verifiers[index % len(verifiers)],
+            chunks=chunks,
+            max_tokens=8,
+        )
+        for index, chunks in enumerate(chunkings)
+    ]
+
+    # Guard against a vacuous pass: three empty streams are equal too.
+    assert runs[0], (
+        "INV-3 gate is vacuous: greedy generation emitted no tokens"
+    )
+    assert runs[0] == runs[1] == runs[2], (
+        f"INV-3 violated: chunkings produced divergent token streams\n"
+        f"  1×20   = {runs[0]!r}\n"
+        f"  3×med  = {runs[1]!r}\n"
+        f"  10×sm  = {runs[2]!r}"
+    )
+
+
+def test_repeated_runs_with_same_history_byte_identical(
+    session_verifier_pair,
+):
+    """Run one workload twice on the same verifier and require the
+    two greedy streams to match.
+
+    A sanity check against accidental RNG: greedy decoding has no
+    legitimate source of nondeterminism. Only the first verifier of
+    the pair is used.
+    """
+    fv_a = session_verifier_pair[0]
+    history = list(range(42, 47))
+
+    def run_once() -> List[int]:
+        # _drive resets the verifier, so each call starts clean.
+        return _drive(verifier=fv_a, chunks=[history], max_tokens=6)
+
+    first = run_once()
+    second = run_once()
+
+    assert first == second, (
+        "non-determinism in repeated greedy runs:\n"
+        f"  first  = {first!r}\n"
+        f"  second = {second!r}"
+    )
+    # Sanity: greedy with a real verifier should produce SOMETHING.
+    assert first