Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,12 @@ jobs:
--inline-logs -v \
-O twister-out/benchmarks

# Parse benchmark timing and print summary table.
# No fail threshold yet — we need baseline data first.
- name: Benchmark timing summary
if: always()
run: python app/tools/parse_benchmark.py twister-out/benchmarks/

# All 17 samples are build_only; CI proves they compile clean.
- name: Twister — samples
run: |
Expand Down
196 changes: 196 additions & 0 deletions tests/python/test_cross_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# SPDX-License-Identifier: MIT
"""Cross-validation harness — proves Python evaluator matches golden vectors.

Each vector in tests/vectors/<name>/vector.json contains an inline ARB model,
input facts/timestamps, and expected outputs. The Python evaluator is the
reference implementation; these vectors will also be consumed by the C engine
under Zephyr to prove cross-platform equivalence.

Tests:
1. Parametrised golden-vector evaluation (10+ vectors).
2. Determinism: same input, 100 runs → identical output.
3. Compile-to-C: each vector model compiles and the generated source
contains the required ARBITER_generated_model symbol.
"""

from __future__ import annotations

import json
import tempfile
from pathlib import Path
from typing import Any

import pytest

from arbiter.compiler import CompileOptions, compile_model
from arbiter.evaluator import ArbiterEvaluator

# Golden-vector root: "<directory of this file>/../vectors", resolved to an absolute path.
VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _discover_vectors(vectors_dir: Path | None = None) -> list[str]:
    """Return the sorted names of vector directories that contain a vector.json.

    Args:
        vectors_dir: Directory to scan. When omitted, the module-level
            ``VECTORS_DIR`` is used, so existing zero-argument callers are
            unaffected.

    Returns:
        Sorted sub-directory names. The list is empty when the root does not
        exist or is not a directory.
    """
    root = VECTORS_DIR if vectors_dir is None else vectors_dir
    # is_dir() rather than exists(): a stray regular file at this path must
    # give an empty result instead of NotADirectoryError from iterdir().
    if not root.is_dir():
        return []
    return sorted(
        entry.name
        for entry in root.iterdir()
        if entry.is_dir() and (entry / "vector.json").is_file()
    )


def _load_vector(name: str) -> dict:
    """Decode ``VECTORS_DIR/<name>/vector.json`` (UTF-8) and return the parsed JSON."""
    vector_file = VECTORS_DIR.joinpath(name, "vector.json")
    raw_text = vector_file.read_text(encoding="utf-8")
    return json.loads(raw_text)


def _run_vector(vec: dict) -> tuple[ArbiterEvaluator, Any]:
    """Evaluate one golden vector with the Python evaluator.

    Args:
        vec: Parsed vector.json content. ``vec["model"]`` is required;
            ``facts``, ``timestamps`` and ``snapshot_timestamp_ms`` are
            optional.

    Returns:
        ``(evaluator, result)`` where ``result`` is whatever ``ev.eval()``
        returns. Callers in this module read attributes such as
        ``fired_rules`` from it and call ``to_dict()``, so it is a result
        object rather than a plain dict.

    Raises:
        KeyError: If ``vec`` has no ``"model"`` entry.
    """
    ev = ArbiterEvaluator(vec["model"])

    # Seed the input fact values.
    for fact_name, value in vec.get("facts", {}).items():
        ev.set_fact(fact_name, value)

    # Seed the per-fact timestamps.
    for fact_name, ms in vec.get("timestamps", {}).items():
        ev.set_timestamp(fact_name, ms)

    # The setter is only called for a non-zero snapshot timestamp; a missing
    # or zero value leaves the evaluator untouched.
    snap_ts = vec.get("snapshot_timestamp_ms", 0)
    if snap_ts:
        ev.set_snapshot_timestamp(snap_ts)

    result = ev.eval()
    return ev, result


# ---------------------------------------------------------------------------
# 1. Golden vector evaluation
# ---------------------------------------------------------------------------

# Discovered once at import time; the parametrize decorators below consume this list.
_VECTOR_NAMES = _discover_vectors()


@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"])
def test_golden_vector(vector_name: str) -> None:
    """Check one golden vector's evaluator output against its ``expected`` block."""
    # The sentinel is only parametrised when discovery found nothing; an
    # empty vector set is a failure for this test, not a skip.
    if vector_name == "_no_vectors_":
        pytest.fail("No golden vectors found in tests/vectors/")

    vector = _load_vector(vector_name)
    want = vector["expected"]
    evaluator, outcome = _run_vector(vector)

    # Rule firing order is significant: compare the lists as-is.
    assert outcome.fired_rules == want["fired_rules"], (
        f"[{vector_name}] fired_rules mismatch"
    )

    # A vector without "current_mode" expects None.
    assert outcome.current_mode == want.get("current_mode"), (
        f"[{vector_name}] current_mode mismatch"
    )

    # Faults are compared order-insensitively.
    assert sorted(outcome.raised_faults) == sorted(want.get("raised_faults", [])), (
        f"[{vector_name}] raised_faults mismatch"
    )

    # Requested actions are compared in order.
    assert outcome.requested_actions == want.get("requested_actions", []), (
        f"[{vector_name}] requested_actions mismatch"
    )

    # Only the facts named by the vector are checked; others are ignored.
    for fact_name, expected_val in want.get("fact_values", {}).items():
        actual = evaluator._fact_values.get(fact_name)
        assert actual == expected_val, (
            f"[{vector_name}] fact {fact_name}: expected {expected_val}, got {actual}"
        )


# ---------------------------------------------------------------------------
# 2. Determinism — same input, 100 runs, identical output
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("vector_name", _VECTOR_NAMES[:3] or ["_no_vectors_"])
def test_determinism(vector_name: str) -> None:
    """Evaluate one vector 100 times and require every run to equal the first."""
    if vector_name == "_no_vectors_":
        pytest.skip("No vectors for determinism test")

    vec = _load_vector(vector_name)

    # Collect all runs first, then compare; each run builds a fresh evaluator
    # through _run_vector.
    snapshots: list[dict] = [_run_vector(vec)[1].to_dict() for _ in range(100)]

    reference, later_runs = snapshots[0], snapshots[1:]
    for i, snapshot in enumerate(later_runs, start=1):
        assert snapshot == reference, (
            f"[{vector_name}] Non-deterministic result on iteration {i}"
        )


# ---------------------------------------------------------------------------
# 3. Compile-to-C — verify each vector model compiles to valid C source
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"])
def test_compile_to_c(vector_name: str) -> None:
    """Compile a vector's model to C and check the generated files for required symbols."""
    if vector_name == "_no_vectors_":
        pytest.skip("No vectors for compile test")

    model_data = _load_vector(vector_name)["model"]

    # yaml is imported locally; no other code in this module uses it.
    import yaml

    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        model_path = workdir / "model.arb.yaml"
        c_path = workdir / "model.c"
        h_path = workdir / "model.h"

        # compile_model is given a file path, so the inline model is
        # serialised to a YAML file first.
        model_yaml = yaml.dump(model_data, default_flow_style=False)
        model_path.write_text(model_yaml, encoding="utf-8")

        result = compile_model(model_path, CompileOptions(out_c=c_path, out_h=h_path))

        assert result.success, (
            f"[{vector_name}] Compilation failed: "
            + "; ".join(d.message for d in result.diagnostics.errors)
        )

        # Both outputs are read back and checked for the symbols the test requires.
        c_source = c_path.read_text(encoding="utf-8")
        h_source = h_path.read_text(encoding="utf-8")

        assert "ARBITER_generated_model" in c_source, (
            f"[{vector_name}] Missing ARBITER_generated_model in C source"
        )
        assert "ARBITER_generated_model" in h_source, (
            f"[{vector_name}] Missing ARBITER_generated_model in header"
        )
        assert "ARBITER_MODEL_HASH" in h_source, (
            f"[{vector_name}] Missing ARBITER_MODEL_HASH in header"
        )
39 changes: 27 additions & 12 deletions tests/python/test_golden_vectors.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,58 @@
# SPDX-License-Identifier: MIT
"""Golden vector tests — framework for verifying deterministic evaluation.

Each model in tests/vectors/ should include:
- input_snapshot.json
- expected_result.json
- expected_trace.json
Each subdirectory under tests/vectors/ contains a vector.json with an inline
ARB model, input facts/timestamps, and expected results.

The same vectors are tested by:
- Python reference evaluator (this file)
- Python reference evaluator (this file + test_cross_validation.py)
- Generated C runtime under Zephyr
- Blob runtime under Zephyr

NOTE: The comprehensive cross-validation tests are in test_cross_validation.py.
This file is kept for backwards compatibility with the original vector
discovery mechanism.
"""

import json
from pathlib import Path

import pytest

from arbiter.evaluator import ArbiterEvaluator

VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors"


def _load_vectors():
"""Discover and load golden vector test cases."""
"""Discover golden vector test cases using vector.json format."""
if not VECTORS_DIR.exists():
return []
vectors = []
for d in sorted(VECTORS_DIR.iterdir()):
if d.is_dir() and (d / "input_snapshot.json").exists():
if d.is_dir() and (d / "vector.json").exists():
vectors.append(d.name)
return vectors


@pytest.mark.parametrize("vector_name", _load_vectors() or ["placeholder"])
def test_golden_vector(vector_name):
"""Verify golden vector produces expected result."""
"""Verify golden vector produces expected result via Python evaluator."""
if vector_name == "placeholder":
pytest.skip("No golden vectors yet — add to tests/vectors/")
vector_dir = VECTORS_DIR / vector_name
input_snap = json.loads((vector_dir / "input_snapshot.json").read_text())
expected = json.loads((vector_dir / "expected_result.json").read_text())
# TODO: implement Python reference evaluator and compare
pytest.skip("Python reference evaluator not yet implemented")
vec = json.loads((vector_dir / "vector.json").read_text(encoding="utf-8"))
model_data = vec["model"]
expected = vec["expected"]

ev = ArbiterEvaluator(model_data)
for fact_name, value in vec.get("facts", {}).items():
ev.set_fact(fact_name, value)
for fact_name, ms in vec.get("timestamps", {}).items():
ev.set_timestamp(fact_name, ms)
snap_ts = vec.get("snapshot_timestamp_ms", 0)
if snap_ts:
ev.set_snapshot_timestamp(snap_ts)

result = ev.eval()
assert result.fired_rules == expected["fired_rules"]
30 changes: 30 additions & 0 deletions tests/vectors/01_basic_eval/vector.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"description": "Basic evaluation: unconditional rule always fires",
"model": {
"arb_version": 0.1,
"model": "vec_basic_eval",
"target": {"rtos": "zephyr"},
"facts": [
{"id": "sensor.value", "type": "int32", "default": 0}
],
"rules": [
{
"id": "rule.always_on",
"class": "inference",
"then": {"explanation": "Unconditional rule fires every tick"}
}
],
"actions": [],
"modes": []
},
"facts": {"sensor.value": 42},
"timestamps": {},
"snapshot_timestamp_ms": 0,
"expected": {
"fired_rules": ["rule.always_on"],
"current_mode": null,
"raised_faults": [],
"requested_actions": [],
"fact_values": {"sensor.value": 42}
}
}
40 changes: 40 additions & 0 deletions tests/vectors/02_safety_guard_ordering/vector.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"description": "Safety guard ordering: safety_guard fires before inference regardless of declaration order",
"model": {
"arb_version": 0.1,
"model": "vec_safety_ordering",
"target": {"rtos": "zephyr"},
"facts": [
{"id": "temp_c", "type": "int32", "default": 0}
],
"rules": [
{
"id": "rule.inference_first_alpha",
"class": "inference",
"then": {"explanation": "Inference rule"}
},
{
"id": "rule.advisory_z",
"class": "advisory",
"then": {"explanation": "Advisory rule"}
},
{
"id": "rule.safety_override",
"class": "safety_guard",
"then": {"explanation": "Safety guard fires first"}
}
],
"actions": [],
"modes": []
},
"facts": {"temp_c": 100},
"timestamps": {},
"snapshot_timestamp_ms": 0,
"expected": {
"fired_rules": ["rule.safety_override", "rule.inference_first_alpha", "rule.advisory_z"],
"current_mode": null,
"raised_faults": [],
"requested_actions": [],
"fact_values": {"temp_c": 100}
}
}
36 changes: 36 additions & 0 deletions tests/vectors/03_expr_scale/vector.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"description": "Expression opcode: scale — target = (left * right) / scale",
"model": {
"arb_version": 0.1,
"model": "vec_expr_scale",
"target": {"rtos": "zephyr"},
"facts": [
{"id": "input_a", "type": "int32", "default": 0},
{"id": "input_b", "type": "int32", "default": 0},
{"id": "result", "type": "int32", "default": 0}
],
"rules": [
{
"id": "rule.compute",
"class": "inference",
"then": {
"compute": [
{"target": "result", "op": "scale", "left": "input_a", "right": "input_b", "scale": 1000}
]
}
}
],
"actions": [],
"modes": []
},
"facts": {"input_a": 5000, "input_b": 2500, "result": 0},
"timestamps": {},
"snapshot_timestamp_ms": 0,
"expected": {
"fired_rules": ["rule.compute"],
"current_mode": null,
"raised_faults": [],
"requested_actions": [],
"fact_values": {"input_a": 5000, "input_b": 2500, "result": 12500}
}
}
Loading
Loading