Skip to content

Commit 5495621

Browse files
authored
Add benchmark models for IX-HapticSight
This module defines backend-agnostic scenario, metric, and result structures for deterministic repository benchmarks, supporting various benchmark types and ensuring explicit inputs and outcomes.
1 parent 493f055 commit 5495621

1 file changed

Lines changed: 225 additions & 0 deletions

File tree

src/ohip_bench/models.py

Lines changed: 225 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,225 @@
"""
Benchmark models for IX-HapticSight.

This module defines backend-agnostic scenario, metric, and result structures
for deterministic repository benchmarks. These models are intended to support:

- consent-path benchmarks
- safety-veto benchmarks
- planning/execution benchmarks
- replay-backed regression checks

Design goals:
- explicit inputs
- explicit expected outcomes
- structured measurable outputs
- no hidden dependence on one runtime transport
"""
18+
19+
from __future__ import annotations
20+
21+
from dataclasses import dataclass, field
22+
from enum import Enum
23+
from time import time
24+
from typing import Any, Optional
25+
26+
27+
class BenchmarkDomain(str, Enum):
    """
    Category label attached to benchmark scenarios and results.

    Members also subclass ``str``, so each one compares equal to its literal
    value (for example ``BenchmarkDomain.SAFETY == "SAFETY"``).
    """

    CONSENT = "CONSENT"
    SAFETY = "SAFETY"
    PLANNING = "PLANNING"
    EXECUTION = "EXECUTION"
    LOGGING = "LOGGING"
    REPLAY = "REPLAY"
    INTEGRATION = "INTEGRATION"
35+
36+
37+
class BenchmarkOutcome(str, Enum):
    """
    Verdict label recorded on a benchmark result.

    Members also subclass ``str``, so each one compares equal to its literal
    value (for example ``BenchmarkOutcome.PASS == "PASS"``).
    """

    PASS = "PASS"
    FAIL = "FAIL"
    ERROR = "ERROR"
    SKIPPED = "SKIPPED"
42+
43+
44+
@dataclass(frozen=True)
class BenchmarkMetric:
    """
    One measured metric from a benchmark run.

    Instances are immutable (``frozen=True``).
    """

    name: str
    value: float
    unit: str = ""  # optional; empty string when no unit is given
    note: str = ""  # optional free-form annotation

    def to_dict(self) -> dict[str, Any]:
        """
        Return the metric as a plain dict.

        ``value`` is coerced with ``float()`` so integer inputs serialize as
        floats.
        """
        return {
            "name": self.name,
            "value": float(self.value),
            "unit": self.unit,
            "note": self.note,
        }
62+
63+
64+
@dataclass(frozen=True)
class BenchmarkExpectation:
    """
    Explicit expected outcome for one benchmark scenario.

    This is intentionally narrow and machine-friendly so that benchmark logic
    can compare real outputs against stated expectations without relying on
    vague prose.
    """

    expected_status: str
    # The three optional fields below are only checked by
    # ``compare_expectation`` when set: ``None`` / empty string means
    # "do not check this field".
    expected_executable: Optional[bool] = None
    expected_fault_reason: str = ""
    expected_execution_status: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the expectation as a plain dict keyed by field name."""
        return {
            "expected_status": self.expected_status,
            "expected_executable": self.expected_executable,
            "expected_fault_reason": self.expected_fault_reason,
            "expected_execution_status": self.expected_execution_status,
        }
86+
87+
88+
@dataclass(frozen=True)
class BenchmarkScenario:
    """
    Canonical scenario definition for one deterministic benchmark case.

    `inputs` is intentionally a plain mapping so the benchmark package can
    describe scenarios without importing every runtime model eagerly.
    """

    scenario_id: str
    title: str
    domain: BenchmarkDomain
    description: str
    inputs: dict[str, Any]
    expectation: BenchmarkExpectation
    tags: tuple[str, ...] = ()
    # Defaults to the wall-clock time (``time.time()``) at construction.
    created_at_utc_s: float = field(default_factory=time)

    def to_dict(self) -> dict[str, Any]:
        """
        Return the scenario as a plain dict.

        ``domain`` is emitted as its enum value, ``tags`` as a list, and
        ``expectation`` via its own ``to_dict()``. ``inputs`` is copied
        shallowly: the returned mapping is a new dict, but any nested values
        are the same objects held by the scenario.
        """
        return {
            "scenario_id": self.scenario_id,
            "title": self.title,
            "domain": self.domain.value,
            "description": self.description,
            "inputs": dict(self.inputs),
            "expectation": self.expectation.to_dict(),
            "tags": list(self.tags),
            "created_at_utc_s": float(self.created_at_utc_s),
        }
117+
118+
119+
@dataclass(frozen=True)
class BenchmarkObservation:
    """
    Observed structured outcome from one benchmark run.

    The ``observed_*`` fields mirror the ``expected_*`` fields of
    ``BenchmarkExpectation`` so the two can be compared field by field.
    """

    observed_status: str
    observed_executable: Optional[bool] = None
    observed_fault_reason: str = ""
    observed_execution_status: str = ""
    event_count: int = 0

    def to_dict(self) -> dict[str, Any]:
        """
        Return the observation as a plain dict.

        ``event_count`` is coerced with ``int()``.
        """
        return {
            "observed_status": self.observed_status,
            "observed_executable": self.observed_executable,
            "observed_fault_reason": self.observed_fault_reason,
            "observed_execution_status": self.observed_execution_status,
            "event_count": int(self.event_count),
        }
139+
140+
141+
@dataclass(frozen=True)
class BenchmarkResult:
    """
    Structured result for one executed benchmark scenario.
    """

    scenario_id: str
    domain: BenchmarkDomain
    outcome: BenchmarkOutcome
    observation: BenchmarkObservation
    metrics: tuple[BenchmarkMetric, ...] = ()
    reason_code: str = ""
    # Both timestamps default to ``time.time()`` at construction; each default
    # is evaluated by its own call, so pass explicit values to record a real
    # run interval.
    started_at_utc_s: float = field(default_factory=time)
    finished_at_utc_s: float = field(default_factory=time)

    @property
    def duration_ms(self) -> float:
        """
        Elapsed time between start and finish, in milliseconds.

        Clamped to ``0.0`` so a finish timestamp earlier than the start
        timestamp never yields a negative duration.
        """
        elapsed_s = float(self.finished_at_utc_s) - float(self.started_at_utc_s)
        return max(0.0, elapsed_s * 1000.0)

    def to_dict(self) -> dict[str, Any]:
        """
        Return the result as a plain dict.

        Enum fields are emitted as their values, the observation and each
        metric via their own ``to_dict()``, and the derived ``duration_ms`` is
        included alongside the two timestamps.
        """
        return {
            "scenario_id": self.scenario_id,
            "domain": self.domain.value,
            "outcome": self.outcome.value,
            "observation": self.observation.to_dict(),
            "metrics": [metric.to_dict() for metric in self.metrics],
            "reason_code": self.reason_code,
            "started_at_utc_s": float(self.started_at_utc_s),
            "finished_at_utc_s": float(self.finished_at_utc_s),
            "duration_ms": float(self.duration_ms),
        }
172+
173+
174+
def compare_expectation(
    *,
    expectation: BenchmarkExpectation,
    observation: BenchmarkObservation,
) -> tuple[bool, str]:
    """
    Compare one observed benchmark outcome against its explicit expectation.

    Checks run in a fixed order (status, executable, fault reason, execution
    status) and only the first mismatch is reported. The status check always
    runs; the other three are skipped when the expectation leaves the field
    unset (``None`` for ``expected_executable``, an empty string for the two
    string fields).

    Returns:
    - success flag
    - compact reason string: ``"expectation_met"`` on success, otherwise
      ``"<field>_mismatch:<expected>!=<observed>"``
    """
    # Status is mandatory on the expectation, so it is always compared.
    if expectation.expected_status != observation.observed_status:
        return False, (
            f"status_mismatch:{expectation.expected_status}"
            f"!={observation.observed_status}"
        )

    # ``is not None`` (rather than truthiness) so an expected ``False`` is
    # still enforced.
    if expectation.expected_executable is not None:
        if expectation.expected_executable != observation.observed_executable:
            return False, (
                f"executable_mismatch:{expectation.expected_executable}"
                f"!={observation.observed_executable}"
            )

    if expectation.expected_fault_reason:
        if expectation.expected_fault_reason != observation.observed_fault_reason:
            return False, (
                f"fault_reason_mismatch:{expectation.expected_fault_reason}"
                f"!={observation.observed_fault_reason}"
            )

    if expectation.expected_execution_status:
        if (
            expectation.expected_execution_status
            != observation.observed_execution_status
        ):
            return False, (
                f"execution_status_mismatch:{expectation.expected_execution_status}"
                f"!={observation.observed_execution_status}"
            )

    return True, "expectation_met"
214+
215+
216+
# Names exported by ``from <module> import *``.
__all__ = [
    "BenchmarkDomain",
    "BenchmarkOutcome",
    "BenchmarkMetric",
    "BenchmarkExpectation",
    "BenchmarkScenario",
    "BenchmarkObservation",
    "BenchmarkResult",
    "compare_expectation",
]

0 commit comments

Comments
 (0)