BitConcepts · tbitcs · Jun 2, 2026 · Jun 2, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -243,6 +243,12 @@ jobs:
             --inline-logs -v \
             -O twister-out/benchmarks
 
+      # Parse benchmark timing and print summary table.
+      # No fail threshold yet — we need baseline data first.
+      - name: Benchmark timing summary
+        if: always()
+        run: python app/tools/parse_benchmark.py twister-out/benchmarks/
+
       # All 17 samples are build_only; CI proves they compile clean.
       - name: Twister — samples
         run: |

diff --git a/tests/python/test_cross_validation.py b/tests/python/test_cross_validation.py
@@ -0,0 +1,196 @@
+# SPDX-License-Identifier: MIT
+"""Cross-validation harness — proves Python evaluator matches golden vectors.
+
+Each vector in tests/vectors/<name>/vector.json contains an inline ARB model,
+input facts/timestamps, and expected outputs.  The Python evaluator is the
+reference implementation; these vectors will also be consumed by the C engine
+under Zephyr to prove cross-platform equivalence.
+
+Tests:
+  1. Parametrised golden-vector evaluation (10+ vectors).
+  2. Determinism: same input, 100 runs → identical output (first three
+     vectors only).
+  3. Compile-to-C: each vector model compiles and the generated source
+     contains the required ARBITER_generated_model symbol.
+"""
+
+from __future__ import annotations
+
+import json
+import tempfile
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from arbiter.compiler import CompileOptions, compile_model
+from arbiter.evaluator import ArbiterEvaluator
+
+VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _discover_vectors() -> list[str]:
+    """List the golden-vector directory names found under ``VECTORS_DIR``.
+
+    A subdirectory counts as a vector only when it holds a ``vector.json``
+    file.  Names come back sorted; if ``VECTORS_DIR`` does not exist the
+    result is an empty list.
+    """
+    names: list[str] = []
+    if VECTORS_DIR.exists():
+        for entry in VECTORS_DIR.iterdir():
+            if not entry.is_dir():
+                continue
+            if (entry / "vector.json").exists():
+                names.append(entry.name)
+        names.sort()
+    return names
+
+
+def _load_vector(name: str) -> dict:
+    """Read ``VECTORS_DIR/<name>/vector.json`` and return the decoded JSON."""
+    vector_file = VECTORS_DIR.joinpath(name, "vector.json")
+    with vector_file.open(encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def _run_vector(vec: dict) -> tuple[ArbiterEvaluator, Any]:
+    """Run the Python evaluator on a vector and return (evaluator, result).
+
+    Args:
+        vec: Parsed ``vector.json`` content.  ``vec["model"]`` is required;
+            ``facts``, ``timestamps`` and ``snapshot_timestamp_ms`` are
+            optional.
+
+    Returns:
+        The evaluator, so callers can inspect its state afterwards, and
+        whatever ``ev.eval()`` returned.  The result is an object rather
+        than a plain dict: the tests in this module read attributes such
+        as ``fired_rules`` from it and call ``to_dict()`` on it.
+
+    Raises:
+        KeyError: If the vector has no ``model`` entry.
+    """
+    model_data = vec["model"]
+    ev = ArbiterEvaluator(model_data)
+
+    # Set fact values
+    for fact_name, value in vec.get("facts", {}).items():
+        ev.set_fact(fact_name, value)
+
+    # Set timestamps
+    for fact_name, ms in vec.get("timestamps", {}).items():
+        ev.set_timestamp(fact_name, ms)
+
+    # Set snapshot timestamp.  A missing or zero value is treated as
+    # "not provided", so the setter is not called in that case.
+    snap_ts = vec.get("snapshot_timestamp_ms", 0)
+    if snap_ts:
+        ev.set_snapshot_timestamp(snap_ts)
+
+    result = ev.eval()
+    return ev, result
+
+
+# ---------------------------------------------------------------------------
+# 1. Golden vector evaluation
+# ---------------------------------------------------------------------------
+
+_VECTOR_NAMES = _discover_vectors()
+
+
+@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"])
+def test_golden_vector(vector_name: str) -> None:
+    """Check one golden vector's evaluator output against its ``expected`` block.
+
+    Fails outright (rather than skipping) when no vectors were discovered,
+    so an empty vectors directory cannot pass unnoticed.
+    """
+    if vector_name == "_no_vectors_":
+        pytest.fail("No golden vectors found in tests/vectors/")
+
+    vector = _load_vector(vector_name)
+    want = vector["expected"]
+    evaluator, outcome = _run_vector(vector)
+
+    # Firing order matters, so the lists are compared as-is.
+    assert outcome.fired_rules == want["fired_rules"], (
+        f"[{vector_name}] fired_rules mismatch"
+    )
+
+    # A vector that omits current_mode expects None.
+    want_mode = want.get("current_mode")
+    assert outcome.current_mode == want_mode, (
+        f"[{vector_name}] current_mode mismatch"
+    )
+
+    # Faults are compared order-insensitively (both sides sorted); the
+    # comparison is on lists, so duplicates still count.
+    got_faults = sorted(outcome.raised_faults)
+    want_faults = sorted(want.get("raised_faults", []))
+    assert got_faults == want_faults, (
+        f"[{vector_name}] raised_faults mismatch"
+    )
+
+    # Requested actions are order-sensitive.
+    want_actions = want.get("requested_actions", [])
+    assert outcome.requested_actions == want_actions, (
+        f"[{vector_name}] requested_actions mismatch"
+    )
+
+    # Only the facts named in the vector are checked; others are ignored.
+    for fact_name, want_value in want.get("fact_values", {}).items():
+        got_value = evaluator._fact_values.get(fact_name)
+        assert got_value == want_value, (
+            f"[{vector_name}] fact {fact_name}: expected {want_value}, got {got_value}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# 2. Determinism — same input, 100 runs, identical output
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("vector_name", _VECTOR_NAMES[:3] or ["_no_vectors_"])
+def test_determinism(vector_name: str) -> None:
+    """Evaluate one vector 100 times and require every run to match the first.
+
+    Only the first three discovered vectors are parametrised.  Each run uses
+    a fresh evaluator and is compared via the result's ``to_dict()`` form.
+    """
+    if vector_name == "_no_vectors_":
+        pytest.skip("No vectors for determinism test")
+
+    vector = _load_vector(vector_name)
+    snapshots: list[dict] = [
+        _run_vector(vector)[1].to_dict() for _ in range(100)
+    ]
+
+    # The first run is the reference; iteration numbers start at 1 so they
+    # line up with positions in the full list of runs.
+    reference, *reruns = snapshots
+    for iteration, snapshot in enumerate(reruns, start=1):
+        assert snapshot == reference, (
+            f"[{vector_name}] Non-deterministic result on iteration {iteration}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# 3. Compile-to-C — verify each vector model compiles to valid C source
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("vector_name", _VECTOR_NAMES or ["_no_vectors_"])
+def test_compile_to_c(vector_name: str) -> None:
+    """Compile one vector's inline model and check the generated C and header text.
+
+    The model is written to a temporary YAML file, compiled with
+    ``compile_model``, and the outputs are searched for the symbols the
+    test requires.  Nothing is built or executed.
+    """
+    if vector_name == "_no_vectors_":
+        pytest.skip("No vectors for compile test")
+
+    model_data = _load_vector(vector_name)["model"]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        workdir = Path(tmpdir)
+        import yaml
+
+        model_path = workdir / "model.arb.yaml"
+        c_path = workdir / "model.c"
+        h_path = workdir / "model.h"
+
+        # compile_model is given a file path, so dump the inline model to YAML.
+        yaml_text = yaml.dump(model_data, default_flow_style=False)
+        model_path.write_text(yaml_text, encoding="utf-8")
+
+        opts = CompileOptions(out_c=c_path, out_h=h_path)
+        result = compile_model(model_path, opts)
+
+        # The error list is only joined when the assertion actually fails.
+        assert result.success, (
+            f"[{vector_name}] Compilation failed: "
+            + "; ".join(d.message for d in result.diagnostics.errors)
+        )
+
+        c_source = c_path.read_text(encoding="utf-8")
+        h_source = h_path.read_text(encoding="utf-8")
+
+        # Plain substring checks on the generated text.
+        assert "ARBITER_generated_model" in c_source, (
+            f"[{vector_name}] Missing ARBITER_generated_model in C source"
+        )
+        assert "ARBITER_generated_model" in h_source, (
+            f"[{vector_name}] Missing ARBITER_generated_model in header"
+        )
+        assert "ARBITER_MODEL_HASH" in h_source, (
+            f"[{vector_name}] Missing ARBITER_MODEL_HASH in header"
+        )
diff --git a/tests/python/test_golden_vectors.py b/tests/python/test_golden_vectors.py
@@ -1,43 +1,58 @@
 # SPDX-License-Identifier: MIT
 """Golden vector tests — framework for verifying deterministic evaluation.
 
-Each model in tests/vectors/ should include:
-  - input_snapshot.json
-  - expected_result.json
-  - expected_trace.json
+Each subdirectory under tests/vectors/ contains a vector.json with an inline
+ARB model, input facts/timestamps, and expected results.
 
 The same vectors are tested by:
-  - Python reference evaluator (this file)
+  - Python reference evaluator (this file + test_cross_validation.py)
   - Generated C runtime under Zephyr
   - Blob runtime under Zephyr
+
+NOTE: The comprehensive cross-validation tests are in test_cross_validation.py.
+This file keeps the original test entry point for backwards compatibility; it
+now discovers vectors via vector.json and checks only the fired_rules list.
 """
 
 import json
 from pathlib import Path
 
 import pytest
 
+from arbiter.evaluator import ArbiterEvaluator
+
 VECTORS_DIR = Path(__file__).resolve().parent.parent / "vectors"
 
 
 def _load_vectors():
-    """Discover and load golden vector test cases."""
+    """Discover golden vector test cases using vector.json format."""
     if not VECTORS_DIR.exists():
         return []
     vectors = []
     for d in sorted(VECTORS_DIR.iterdir()):
-        if d.is_dir() and (d / "input_snapshot.json").exists():
+        if d.is_dir() and (d / "vector.json").exists():
             vectors.append(d.name)
     return vectors
 
 
 @pytest.mark.parametrize("vector_name", _load_vectors() or ["placeholder"])
 def test_golden_vector(vector_name):
-    """Verify golden vector produces expected result."""
+    """Verify golden vector produces expected result via Python evaluator."""
     if vector_name == "placeholder":
         pytest.skip("No golden vectors yet — add to tests/vectors/")
     vector_dir = VECTORS_DIR / vector_name
-    input_snap = json.loads((vector_dir / "input_snapshot.json").read_text())
-    expected = json.loads((vector_dir / "expected_result.json").read_text())
-    # TODO: implement Python reference evaluator and compare
-    pytest.skip("Python reference evaluator not yet implemented")
+    vec = json.loads((vector_dir / "vector.json").read_text(encoding="utf-8"))
+    model_data = vec["model"]
+    expected = vec["expected"]
+
+    ev = ArbiterEvaluator(model_data)
+    for fact_name, value in vec.get("facts", {}).items():
+        ev.set_fact(fact_name, value)
+    for fact_name, ms in vec.get("timestamps", {}).items():
+        ev.set_timestamp(fact_name, ms)
+    snap_ts = vec.get("snapshot_timestamp_ms", 0)
+    if snap_ts:
+        ev.set_snapshot_timestamp(snap_ts)
+
+    result = ev.eval()
+    assert result.fired_rules == expected["fired_rules"]
diff --git a/tests/vectors/01_basic_eval/vector.json b/tests/vectors/01_basic_eval/vector.json
@@ -0,0 +1,30 @@
+{
+  "description": "Basic evaluation: unconditional rule always fires",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_basic_eval",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "sensor.value", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.always_on",
+        "class": "inference",
+        "then": {"explanation": "Unconditional rule fires every tick"}
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"sensor.value": 42},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.always_on"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"sensor.value": 42}
+  }
+}
diff --git a/tests/vectors/02_safety_guard_ordering/vector.json b/tests/vectors/02_safety_guard_ordering/vector.json
@@ -0,0 +1,40 @@
+{
+  "description": "Safety guard ordering: safety_guard fires before inference regardless of declaration order",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_safety_ordering",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "temp_c", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.inference_first_alpha",
+        "class": "inference",
+        "then": {"explanation": "Inference rule"}
+      },
+      {
+        "id": "rule.advisory_z",
+        "class": "advisory",
+        "then": {"explanation": "Advisory rule"}
+      },
+      {
+        "id": "rule.safety_override",
+        "class": "safety_guard",
+        "then": {"explanation": "Safety guard fires first"}
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"temp_c": 100},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.safety_override", "rule.inference_first_alpha", "rule.advisory_z"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"temp_c": 100}
+  }
+}
diff --git a/tests/vectors/03_expr_scale/vector.json b/tests/vectors/03_expr_scale/vector.json
@@ -0,0 +1,36 @@
+{
+  "description": "Expression opcode: scale — target = (left * right) / scale",
+  "model": {
+    "arb_version": 0.1,
+    "model": "vec_expr_scale",
+    "target": {"rtos": "zephyr"},
+    "facts": [
+      {"id": "input_a", "type": "int32", "default": 0},
+      {"id": "input_b", "type": "int32", "default": 0},
+      {"id": "result", "type": "int32", "default": 0}
+    ],
+    "rules": [
+      {
+        "id": "rule.compute",
+        "class": "inference",
+        "then": {
+          "compute": [
+            {"target": "result", "op": "scale", "left": "input_a", "right": "input_b", "scale": 1000}
+          ]
+        }
+      }
+    ],
+    "actions": [],
+    "modes": []
+  },
+  "facts": {"input_a": 5000, "input_b": 2500, "result": 0},
+  "timestamps": {},
+  "snapshot_timestamp_ms": 0,
+  "expected": {
+    "fired_rules": ["rule.compute"],
+    "current_mode": null,
+    "raised_faults": [],
+    "requested_actions": [],
+    "fact_values": {"input_a": 5000, "input_b": 2500, "result": 12500}
+  }
+}