Skip to content

Commit 94b0942

Browse files
committed
Merge remote-tracking branch 'origin/orchestrator/crossval'
2 parents d11696c + 08ee6b0 commit 94b0942

16 files changed

Lines changed: 845 additions & 12 deletions

File tree

.github/workflows/ci.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,12 @@ jobs:
261261
--inline-logs -v \
262262
-O twister-out/benchmarks
263263
264+
# Parse benchmark timing and print summary table.
265+
# No fail threshold yet — we need baseline data first.
266+
- name: Benchmark timing summary
267+
if: always()
268+
run: python app/tools/parse_benchmark.py twister-out/benchmarks/
269+
264270
# All 17 samples are build_only; CI proves they compile clean.
265271
- name: Twister — samples
266272
run: |
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
# SPDX-License-Identifier: MIT
2+
"""Cross-validation harness — proves Python evaluator matches golden vectors.
3+
4+
Each vector in tests/vectors/<name>/vector.json contains an inline ARB model,
5+
input facts/timestamps, and expected outputs. The Python evaluator is the
6+
reference implementation; these vectors will also be consumed by the C engine
7+
under Zephyr to prove cross-platform equivalence.
8+
9+
Tests:
10+
1. Parametrised golden-vector evaluation (10+ vectors).
11+
2. Determinism: same input, 100 runs → identical output (first three vectors only).
12+
3. Compile-to-C: each vector model compiles and the generated source
13+
contains the required ARBITER_generated_model symbol.
14+
"""
15+
16+
from __future__ import annotations
17+
18+
import json
19+
import tempfile
20+
from pathlib import Path
21+
22+
import pytest
23+
24+
from arbiter.compiler import CompileOptions, compile_model
25+
from arbiter.evaluator import ArbiterEvaluator
26+
27+
VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors"
28+
29+
30+
# ---------------------------------------------------------------------------
31+
# Helpers
32+
# ---------------------------------------------------------------------------
33+
34+
35+
def _discover_vectors() -> list[str]:
    """List the golden-vector directories found under ``VECTORS_DIR``.

    A child of ``VECTORS_DIR`` qualifies when it is a directory and contains
    an entry named ``vector.json``.

    Returns:
        The qualifying directory names in sorted order, or an empty list when
        ``VECTORS_DIR`` does not exist.
    """
    names: list[str] = []
    if VECTORS_DIR.exists():
        for entry in VECTORS_DIR.iterdir():
            # Stray files next to the vector directories are ignored.
            if not entry.is_dir():
                continue
            if (entry / "vector.json").exists():
                names.append(entry.name)
        # Sorted so the parametrised test ids come out in a stable order.
        names.sort()
    return names
44+
45+
46+
def _load_vector(name: str) -> dict:
    """Read ``VECTORS_DIR/<name>/vector.json`` and return the decoded JSON.

    Args:
        name: Name of the vector directory under ``VECTORS_DIR``.

    Returns:
        The parsed JSON document.
    """
    vector_file = VECTORS_DIR / name / "vector.json"
    with vector_file.open(encoding="utf-8") as handle:
        return json.load(handle)
50+
51+
52+
def _run_vector(vec: dict) -> tuple[ArbiterEvaluator, dict]:
    """Evaluate one golden vector with the Python evaluator.

    Builds an ``ArbiterEvaluator`` from ``vec["model"]``, feeds it the
    optional ``facts`` and ``timestamps`` mappings, applies
    ``snapshot_timestamp_ms`` when that value is truthy, and finally calls
    ``eval()``.

    Args:
        vec: Parsed vector document; ``"model"`` is required, the other keys
            are optional.

    Returns:
        ``(evaluator, result)``, where ``result`` is whatever ``eval()``
        returned.

    NOTE(review): the second element is annotated ``dict``, but the tests in
    this file read attributes such as ``fired_rules`` from it and call
    ``to_dict()`` on it — confirm the real result type and fix the annotation.
    """
    evaluator = ArbiterEvaluator(vec["model"])

    # Fact values go in first, then the per-fact timestamps.
    fact_inputs = vec.get("facts", {})
    for name, value in fact_inputs.items():
        evaluator.set_fact(name, value)

    timestamp_inputs = vec.get("timestamps", {})
    for name, millis in timestamp_inputs.items():
        evaluator.set_timestamp(name, millis)

    # A missing or zero snapshot timestamp leaves the evaluator untouched.
    snapshot_ms = vec.get("snapshot_timestamp_ms", 0)
    if snapshot_ms:
        evaluator.set_snapshot_timestamp(snapshot_ms)

    return evaluator, evaluator.eval()
72+
73+
74+
# ---------------------------------------------------------------------------
75+
# 1. Golden vector evaluation
76+
# ---------------------------------------------------------------------------
77+
78+
_VECTOR_NAMES = _discover_vectors()
79+
80+
81+
@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"])
def test_golden_vector(vector_name: str) -> None:
    """Check the evaluator output for one golden vector against its ``expected`` section.

    The ``_no_vectors_`` placeholder is only injected when discovery found
    nothing; in that case the test fails outright rather than skipping.
    """
    if vector_name == "_no_vectors_":
        pytest.fail("No golden vectors found in tests/vectors/")

    vector = _load_vector(vector_name)
    want = vector["expected"]
    evaluator, outcome = _run_vector(vector)

    # Fired rules are compared as an exact, ordered list.
    assert outcome.fired_rules == want["fired_rules"], (
        f"[{vector_name}] fired_rules mismatch"
    )

    # A vector without "current_mode" expects None.
    assert outcome.current_mode == want.get("current_mode"), (
        f"[{vector_name}] current_mode mismatch"
    )

    # Fault order is not significant, so both sides are sorted first.
    wanted_faults = want.get("raised_faults", [])
    assert sorted(outcome.raised_faults) == sorted(wanted_faults), (
        f"[{vector_name}] raised_faults mismatch"
    )

    # Requested actions are compared as an ordered list.
    wanted_actions = want.get("requested_actions", [])
    assert outcome.requested_actions == wanted_actions, (
        f"[{vector_name}] requested_actions mismatch"
    )

    # Only the facts named in the vector are checked; others are ignored.
    for fact_name, expected_val in want.get("fact_values", {}).items():
        actual = evaluator._fact_values.get(fact_name)
        assert actual == expected_val, (
            f"[{vector_name}] fact {fact_name}: expected {expected_val}, got {actual}"
        )
119+
120+
121+
# ---------------------------------------------------------------------------
122+
# 2. Determinism — same input, 100 runs, identical output
123+
# ---------------------------------------------------------------------------
124+
125+
126+
@pytest.mark.parametrize("vector_name", _VECTOR_NAMES[:3] or ["_no_vectors_"])
def test_determinism(vector_name: str) -> None:
    """Evaluate one vector 100 times and require every run to match the first.

    Only the first three discovered vectors are parametrised. The test is
    skipped when no vectors were discovered.
    """
    if vector_name == "_no_vectors_":
        pytest.skip("No vectors for determinism test")

    vector = _load_vector(vector_name)

    # Each run goes through _run_vector, which builds a new evaluator.
    snapshots = [_run_vector(vector)[1].to_dict() for _ in range(100)]

    reference = snapshots[0]
    for i in range(1, len(snapshots)):
        assert snapshots[i] == reference, (
            f"[{vector_name}] Non-deterministic result on iteration {i}"
        )
144+
145+
146+
# ---------------------------------------------------------------------------
147+
# 3. Compile-to-C — verify each vector model compiles to valid C source
148+
# ---------------------------------------------------------------------------
149+
150+
151+
@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"])
def test_compile_to_c(vector_name: str) -> None:
    """Compile one vector's model to C and look for the required symbols.

    The inline model is written to a temporary YAML file and compiled with
    ``compile_model``. The generated ``model.c`` and ``model.h`` are then
    searched for ``ARBITER_generated_model``, and the header is also searched
    for ``ARBITER_MODEL_HASH``. The test is skipped when no vectors were
    discovered.
    """
    if vector_name == "_no_vectors_":
        pytest.skip("No vectors for compile test")

    model_data = _load_vector(vector_name)["model"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # Kept as a function-scope import, as in the original test.
        import yaml

        workdir = Path(tmpdir)
        model_path = workdir / "model.arb.yaml"
        c_path = workdir / "model.c"
        h_path = workdir / "model.h"

        # The compiler is handed a YAML file, so dump the inline model first.
        yaml_text = yaml.dump(model_data, default_flow_style=False)
        model_path.write_text(yaml_text, encoding="utf-8")

        options = CompileOptions(out_c=c_path, out_h=h_path)
        result = compile_model(model_path, options)

        assert result.success, (
            f"[{vector_name}] Compilation failed: "
            + "; ".join(d.message for d in result.diagnostics.errors)
        )

        # Read both outputs before checking either one.
        c_source = c_path.read_text(encoding="utf-8")
        h_source = h_path.read_text(encoding="utf-8")

        assert "ARBITER_generated_model" in c_source, (
            f"[{vector_name}] Missing ARBITER_generated_model in C source"
        )
        assert "ARBITER_generated_model" in h_source, (
            f"[{vector_name}] Missing ARBITER_generated_model in header"
        )
        assert "ARBITER_MODEL_HASH" in h_source, (
            f"[{vector_name}] Missing ARBITER_MODEL_HASH in header"
        )
Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,58 @@
11
# SPDX-License-Identifier: MIT
22
"""Golden vector tests — framework for verifying deterministic evaluation.
33
4-
Each model in tests/vectors/ should include:
5-
- input_snapshot.json
6-
- expected_result.json
7-
- expected_trace.json
4+
Each subdirectory under tests/vectors/ contains a vector.json with an inline
5+
ARB model, input facts/timestamps, and expected results.
86
97
The same vectors are tested by:
10-
- Python reference evaluator (this file)
8+
- Python reference evaluator (this file + test_cross_validation.py)
119
- Generated C runtime under Zephyr
1210
- Blob runtime under Zephyr
11+
12+
NOTE: The comprehensive cross-validation tests are in test_cross_validation.py.
13+
This file is kept for backwards compatibility with the original vector
14+
discovery mechanism.
1315
"""
1416

1517
import json
1618
from pathlib import Path
1719

1820
import pytest
1921

22+
from arbiter.evaluator import ArbiterEvaluator
23+
2024
VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors"
2125

2226

2327
def _load_vectors():
24-
"""Discover and load golden vector test cases."""
28+
"""Discover golden vector test cases using vector.json format."""
2529
if not VECTORS_DIR.exists():
2630
return []
2731
vectors = []
2832
for d in sorted(VECTORS_DIR.iterdir()):
29-
if d.is_dir() and (d / "input_snapshot.json").exists():
33+
if d.is_dir() and (d / "vector.json").exists():
3034
vectors.append(d.name)
3135
return vectors
3236

3337

3438
@pytest.mark.parametrize("vector_name", _load_vectors() or ["placeholder"])
3539
def test_golden_vector(vector_name):
36-
"""Verify golden vector produces expected result."""
40+
"""Verify golden vector produces expected result via Python evaluator."""
3741
if vector_name == "placeholder":
3842
pytest.skip("No golden vectors yet — add to tests/vectors/")
3943
vector_dir = VECTORS_DIR / vector_name
40-
input_snap = json.loads((vector_dir / "input_snapshot.json").read_text())
41-
expected = json.loads((vector_dir / "expected_result.json").read_text())
42-
# TODO: implement Python reference evaluator and compare
43-
pytest.skip("Python reference evaluator not yet implemented")
44+
vec = json.loads((vector_dir / "vector.json").read_text(encoding="utf-8"))
45+
model_data = vec["model"]
46+
expected = vec["expected"]
47+
48+
ev = ArbiterEvaluator(model_data)
49+
for fact_name, value in vec.get("facts", {}).items():
50+
ev.set_fact(fact_name, value)
51+
for fact_name, ms in vec.get("timestamps", {}).items():
52+
ev.set_timestamp(fact_name, ms)
53+
snap_ts = vec.get("snapshot_timestamp_ms", 0)
54+
if snap_ts:
55+
ev.set_snapshot_timestamp(snap_ts)
56+
57+
result = ev.eval()
58+
assert result.fired_rules == expected["fired_rules"]
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"description": "Basic evaluation: unconditional rule always fires",
3+
"model": {
4+
"arb_version": 0.1,
5+
"model": "vec_basic_eval",
6+
"target": {"rtos": "zephyr"},
7+
"facts": [
8+
{"id": "sensor.value", "type": "int32", "default": 0}
9+
],
10+
"rules": [
11+
{
12+
"id": "rule.always_on",
13+
"class": "inference",
14+
"then": {"explanation": "Unconditional rule fires every tick"}
15+
}
16+
],
17+
"actions": [],
18+
"modes": []
19+
},
20+
"facts": {"sensor.value": 42},
21+
"timestamps": {},
22+
"snapshot_timestamp_ms": 0,
23+
"expected": {
24+
"fired_rules": ["rule.always_on"],
25+
"current_mode": null,
26+
"raised_faults": [],
27+
"requested_actions": [],
28+
"fact_values": {"sensor.value": 42}
29+
}
30+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"description": "Safety guard ordering: safety_guard fires before inference regardless of declaration order",
3+
"model": {
4+
"arb_version": 0.1,
5+
"model": "vec_safety_ordering",
6+
"target": {"rtos": "zephyr"},
7+
"facts": [
8+
{"id": "temp_c", "type": "int32", "default": 0}
9+
],
10+
"rules": [
11+
{
12+
"id": "rule.inference_first_alpha",
13+
"class": "inference",
14+
"then": {"explanation": "Inference rule"}
15+
},
16+
{
17+
"id": "rule.advisory_z",
18+
"class": "advisory",
19+
"then": {"explanation": "Advisory rule"}
20+
},
21+
{
22+
"id": "rule.safety_override",
23+
"class": "safety_guard",
24+
"then": {"explanation": "Safety guard fires first"}
25+
}
26+
],
27+
"actions": [],
28+
"modes": []
29+
},
30+
"facts": {"temp_c": 100},
31+
"timestamps": {},
32+
"snapshot_timestamp_ms": 0,
33+
"expected": {
34+
"fired_rules": ["rule.safety_override", "rule.inference_first_alpha", "rule.advisory_z"],
35+
"current_mode": null,
36+
"raised_faults": [],
37+
"requested_actions": [],
38+
"fact_values": {"temp_c": 100}
39+
}
40+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"description": "Expression opcode: scale — target = (left * right) / scale",
3+
"model": {
4+
"arb_version": 0.1,
5+
"model": "vec_expr_scale",
6+
"target": {"rtos": "zephyr"},
7+
"facts": [
8+
{"id": "input_a", "type": "int32", "default": 0},
9+
{"id": "input_b", "type": "int32", "default": 0},
10+
{"id": "result", "type": "int32", "default": 0}
11+
],
12+
"rules": [
13+
{
14+
"id": "rule.compute",
15+
"class": "inference",
16+
"then": {
17+
"compute": [
18+
{"target": "result", "op": "scale", "left": "input_a", "right": "input_b", "scale": 1000}
19+
]
20+
}
21+
}
22+
],
23+
"actions": [],
24+
"modes": []
25+
},
26+
"facts": {"input_a": 5000, "input_b": 2500, "result": 0},
27+
"timestamps": {},
28+
"snapshot_timestamp_ms": 0,
29+
"expected": {
30+
"fired_rules": ["rule.compute"],
31+
"current_mode": null,
32+
"raised_faults": [],
33+
"requested_actions": [],
34+
"fact_values": {"input_a": 5000, "input_b": 2500, "result": 12500}
35+
}
36+
}

0 commit comments

Comments
 (0)