-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_inv3_session_determinism_gate.py
More file actions
193 lines (159 loc) · 6.48 KB
/
Copy pathtest_inv3_session_determinism_gate.py
File metadata and controls
193 lines (159 loc) · 6.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""ADR 0008 §7 GA gate G3 — INV-3 byte-exact determinism.
Drives two independent ``GenerationCoordinator`` instances against
**real** Qwen3-0.6B verifiers through identical history fed via
different chunkings, and asserts the resulting greedy token
streams are byte-identical. This is the integration-level
counterpart of the Linux unit test
``tests/inference_engine/session/test_generator.py::TestDeterminism``,
which uses the deterministic ``FakeVerifier`` to verify the
**dispatch** is non-stateful; this file verifies the same property
holds against the actual verifier numerics on the target hardware.
Replaces the deleted ``tests/core/test_determinism_gate.py`` (PR-A3
removed it together with ``verifier.path_select``; the replacement
landed here, in the integration suite, instead of in
``tests/core/`` because integration is where Mac-M4-only GA gates
belong per ADR 0008 §9).
Marker
------
This whole file inherits ``@pytest.mark.integration`` via
``conftest.py``. Bare ``pytest`` skips it; opt in with::
pytest -m integration tests/integration/test_inv3_session_determinism_gate.py
Fixture cost
------------
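``session_verifier_pair`` builds its verifiers inline instead of going
through ``fresh_verifier_factory`` (from ``tests/conftest.py``), which
is function-scoped and so cannot back a module-scoped fixture. Each
build loads Qwen3-0.6B from the HF cache. On Mac M4 with a warm cache
the load is <2 s; cold takes 10-30 s plus download. The loaded
verifiers are shared across tests in this file via
``session_verifier_pair``.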
"""
from __future__ import annotations
from typing import List
import pytest
import torch
from inference_engine.session import (
AppendTokensCoordinator,
GenerationCoordinator,
SessionStore,
TokenEvent,
)
@pytest.fixture(scope="module")
def session_verifier_pair():
    """Yield two independently constructed ``SinkWindowVerifier`` objects.

    Only the verifiers are created here; the ``SessionStore`` and the
    coordinators are built per workload by ``_drive``.

    Module-scoped: loading Qwen3-0.6B twice costs ~2-4 s on Mac M4
    with a warm HF cache. Tests share the pair; ``_drive`` calls
    ``reset()`` on the verifier it is handed before running its
    workload, which is what is relied on to prevent cross-test
    bleed-over (assumes ``reset()`` clears all per-session verifier
    state -- confirm against the verifier implementation).

    The verifier is built inline (rather than going through
    ``fresh_verifier_factory``, which is function-scoped in
    ``tests/conftest.py``) so the module scope is consistent --
    pytest forbids a module-scoped fixture depending on a function-
    scoped one.

    Yields:
        A ``(fv_a, fv_b)`` tuple of verifiers built with identical
        configuration (bfloat16, CPU, sink_size=4, window_size=64).
    """
    # Kept function-local as in the original; ``torch`` is already
    # imported at module level, so it is not re-imported here.
    from kv_cache_proposer.verifier import SinkWindowVerifier, VerifierConfig

    def _build(sink: int, window: int) -> SinkWindowVerifier:
        """Construct one verifier with the given sink/window sizes."""
        config = VerifierConfig(
            dtype=torch.bfloat16,
            device="cpu",
            sink_size=sink,
            window_size=window,
        )
        return SinkWindowVerifier(config)

    # Two separate instances so the tests can compare runs that share
    # no Python-level verifier object.
    fv_a = _build(sink=4, window=64)
    fv_b = _build(sink=4, window=64)
    yield fv_a, fv_b
def _drive(
    *,
    verifier,
    chunks: List[List[int]],
    max_tokens: int,
) -> List[int]:
    """Replay ``chunks`` into a brand-new session and collect the output.

    The verifier is reset first, then wrapped in a capacity-1
    ``SessionStore`` with an append coordinator and a generation
    coordinator. Every chunk is appended in the order given, after
    which generation runs with ``max_tokens``.

    Args:
        verifier: Verifier to drive; also passed to the store as its
            ``cache_inspector``.
        chunks: Token-id lists appended one call per list.
        max_tokens: Forwarded to ``GenerationCoordinator.generate``.

    Returns:
        The ``token_id`` of each ``TokenEvent`` yielded by generation,
        in emission order. Events of any other type are skipped.
    """
    verifier.reset()

    session_store = SessionStore(capacity=1, cache_inspector=verifier)
    appender = AppendTokensCoordinator(session_store, verifier)
    generator = GenerationCoordinator(session_store, verifier)
    session = session_store.create_session()

    for piece in chunks:
        appender.append_tokens(session.session_id, piece)

    events = generator.generate(session.session_id, max_tokens=max_tokens)
    return [
        event.token_id
        for event in events
        if isinstance(event, TokenEvent)
    ]
def test_one_call_vs_two_calls_yield_byte_identical_tokens(
    session_verifier_pair,
):
    """The minimal INV-3 gate: same total token sequence delivered
    in 1 call vs. 2 calls produces bit-identical greedy output.

    One verifier receives the whole history in a single append; the
    other receives it as two halves. Both then generate 12 tokens and
    the two streams must compare equal. A final check rejects the
    degenerate case where both runs emit nothing, which would make the
    equality assertion pass vacuously.
    """
    fv_a, fv_b = session_verifier_pair
    full_history = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    # Split at the midpoint (index 5 for this 10-token history).
    split_at = len(full_history) // 2

    tokens_one_call = _drive(
        verifier=fv_a, chunks=[full_history], max_tokens=12,
    )
    tokens_two_calls = _drive(
        verifier=fv_b,
        chunks=[full_history[:split_at], full_history[split_at:]],
        max_tokens=12,
    )

    assert tokens_one_call == tokens_two_calls, (
        f"INV-3 violated: chunking changed greedy output\n"
        f"  one-call  = {tokens_one_call!r}\n"
        f"  two-calls = {tokens_two_calls!r}"
    )
    # Sanity: two empty streams are trivially equal, so require output.
    assert tokens_one_call, (
        "greedy generation emitted no tokens; the equality check is vacuous"
    )
def test_chunking_invariance_across_three_splits(
    session_verifier_pair,
):
    """Stronger version: three different chunkings all produce the
    same final greedy stream. This is meant to catch chunk-boundary
    numerical drift the 1-vs-2 case might miss.

    The verifier's sink+window is (4, 64) = 68 capacity. We pick a
    history short enough to stay under that bound on the first
    pass and long enough to span more than two chunkings.

    NOTE(review): 20 history tokens + 8 generated tokens = 28, which
    is below the stated 68-token capacity, so this workload does not
    appear to cross a sink+window trim boundary; a bug that only
    triggers on a trim would need a longer history -- confirm.
    """
    fv_a, fv_b = session_verifier_pair
    full = list(range(100, 120))  # token ids 100..119, 20 in total

    chunkings = [
        [full],                                              # 1×20
        [full[:7], full[7:14], full[14:]],                   # 3×medium
        [full[i : i + 2] for i in range(0, len(full), 2)],   # 10×small
    ]

    # Alternate which verifier we use to keep state fully disjoint
    # across chunkings (we have two; the third chunking reuses fv_a
    # after the reset that ``_drive`` performs).
    verifiers = (fv_a, fv_b)
    runs = [
        _drive(
            verifier=verifiers[idx % len(verifiers)],
            chunks=chunks,
            max_tokens=8,
        )
        for idx, chunks in enumerate(chunkings)
    ]

    assert runs[0] == runs[1] == runs[2], (
        f"INV-3 violated: chunkings produced divergent token streams\n"
        f"  1×20   = {runs[0]!r}\n"
        f"  3×med  = {runs[1]!r}\n"
        f"  10×sm  = {runs[2]!r}"
    )
    # Sanity: three empty streams are trivially equal, so require output.
    assert runs[0], (
        "greedy generation emitted no tokens; the equality check is vacuous"
    )
def test_repeated_runs_with_same_history_byte_identical(
    session_verifier_pair,
):
    """Baseline determinism: one verifier, one workload, run twice.

    The same five-token history is driven through the first verifier
    of the pair two times in a row, generating 6 tokens each time, and
    the two outputs must be equal. This guards against accidental RNG
    (greedy decoding has no legitimate source of nondeterminism).
    """
    verifier, _other = session_verifier_pair
    history = [42, 43, 44, 45, 46]

    # The generator is consumed left to right, so the runs execute
    # sequentially: first, then second.
    first, second = (
        _drive(verifier=verifier, chunks=[history], max_tokens=6)
        for _attempt in range(2)
    )

    assert first == second, (
        f"non-determinism in repeated greedy runs:\n"
        f"  first  = {first!r}\n"
        f"  second = {second!r}"
    )
    # Sanity: greedy with a real verifier should produce SOMETHING.
    assert first