Skip to content

Commit 2887f2c

Browse files
authored
Add tool ordering replay artifact
Add a deterministic tool-ordering replay artifact generator and committed artifact for manifest-registered fixture ordering evidence. Scope remains limited to tool-ordering artifact generation and tests; no fixture payloads, README, workflows, runtime/orchestration behavior, or new failure labels changed.
1 parent 5213a8c commit 2887f2c

3 files changed

Lines changed: 728 additions & 0 deletions

File tree

Lines changed: 371 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
{
2+
"artifact_id": "tool_ordering_replay_results_v1",
3+
"generated_by": "ToolOrderingReplayArtifactGenerator",
4+
"version": "1.0",
5+
"evaluation_mode": "deterministic",
6+
"llm_judges": "none",
7+
"external_apis": "none",
8+
"families": [
9+
{
10+
"family": "coding_workflow_pr_review",
11+
"fixtures": [
12+
{
13+
"fixture_id": "coding_workflow_pr_review_degraded_v1",
14+
"degradation_level": "severe",
15+
"expected_admissible": false,
16+
"expected_failure_labels": [
17+
"CAUSAL_DEPENDENCY_LOSS",
18+
"INVARIANT_VIOLATION",
19+
"POLICY_ORDER_BROKEN",
20+
"RECOVERY_PATH_INVALID"
21+
],
22+
"tool_ordering": {
23+
"original_edge_count": 0,
24+
"replay_edge_count": 0,
25+
"missing_edges": [],
26+
"added_edges": [],
27+
"original_node_count": 0,
28+
"replay_node_count": 0,
29+
"missing_nodes": [],
30+
"added_nodes": [],
31+
"required_before_violations": [],
32+
"drift_detected": false
33+
}
34+
},
35+
{
36+
"fixture_id": "coding_workflow_pr_review_mild_v1",
37+
"degradation_level": "mild",
38+
"expected_admissible": false,
39+
"expected_failure_labels": [
40+
"RECOVERY_PATH_INVALID"
41+
],
42+
"tool_ordering": {
43+
"original_edge_count": 0,
44+
"replay_edge_count": 0,
45+
"missing_edges": [],
46+
"added_edges": [],
47+
"original_node_count": 0,
48+
"replay_node_count": 0,
49+
"missing_nodes": [],
50+
"added_nodes": [],
51+
"required_before_violations": [],
52+
"drift_detected": false
53+
}
54+
},
55+
{
56+
"fixture_id": "coding_workflow_pr_review_moderate_v1",
57+
"degradation_level": "moderate",
58+
"expected_admissible": false,
59+
"expected_failure_labels": [
60+
"CAUSAL_DEPENDENCY_LOSS",
61+
"RECOVERY_PATH_INVALID"
62+
],
63+
"tool_ordering": {
64+
"original_edge_count": 0,
65+
"replay_edge_count": 0,
66+
"missing_edges": [],
67+
"added_edges": [],
68+
"original_node_count": 0,
69+
"replay_node_count": 0,
70+
"missing_nodes": [],
71+
"added_nodes": [],
72+
"required_before_violations": [],
73+
"drift_detected": false
74+
}
75+
},
76+
{
77+
"fixture_id": "coding_workflow_pr_review_v1",
78+
"degradation_level": "baseline",
79+
"expected_admissible": true,
80+
"expected_failure_labels": [],
81+
"tool_ordering": {
82+
"original_edge_count": 0,
83+
"replay_edge_count": 0,
84+
"missing_edges": [],
85+
"added_edges": [],
86+
"original_node_count": 0,
87+
"replay_node_count": 0,
88+
"missing_nodes": [],
89+
"added_nodes": [],
90+
"required_before_violations": [],
91+
"drift_detected": false
92+
}
93+
}
94+
]
95+
},
96+
{
97+
"family": "cross_domain_operational_dependency_workflow",
98+
"fixtures": [
99+
{
100+
"fixture_id": "cross_domain_operational_dependency_workflow_degraded_v1",
101+
"degradation_level": "severe",
102+
"expected_admissible": false,
103+
"expected_failure_labels": [
104+
"CAUSAL_DEPENDENCY_LOSS",
105+
"INVARIANT_VIOLATION",
106+
"POLICY_ORDER_BROKEN",
107+
"RECOVERY_PATH_INVALID"
108+
],
109+
"tool_ordering": {
110+
"original_edge_count": 0,
111+
"replay_edge_count": 0,
112+
"missing_edges": [],
113+
"added_edges": [],
114+
"original_node_count": 0,
115+
"replay_node_count": 0,
116+
"missing_nodes": [],
117+
"added_nodes": [],
118+
"required_before_violations": [],
119+
"drift_detected": false
120+
}
121+
},
122+
{
123+
"fixture_id": "cross_domain_operational_dependency_workflow_mild_v1",
124+
"degradation_level": "mild",
125+
"expected_admissible": false,
126+
"expected_failure_labels": [
127+
"INVARIANT_VIOLATION",
128+
"RECOVERY_PATH_INVALID"
129+
],
130+
"tool_ordering": {
131+
"original_edge_count": 0,
132+
"replay_edge_count": 0,
133+
"missing_edges": [],
134+
"added_edges": [],
135+
"original_node_count": 0,
136+
"replay_node_count": 0,
137+
"missing_nodes": [],
138+
"added_nodes": [],
139+
"required_before_violations": [],
140+
"drift_detected": false
141+
}
142+
},
143+
{
144+
"fixture_id": "cross_domain_operational_dependency_workflow_moderate_v1",
145+
"degradation_level": "moderate",
146+
"expected_admissible": false,
147+
"expected_failure_labels": [
148+
"CAUSAL_DEPENDENCY_LOSS",
149+
"INVARIANT_VIOLATION",
150+
"RECOVERY_PATH_INVALID"
151+
],
152+
"tool_ordering": {
153+
"original_edge_count": 0,
154+
"replay_edge_count": 0,
155+
"missing_edges": [],
156+
"added_edges": [],
157+
"original_node_count": 0,
158+
"replay_node_count": 0,
159+
"missing_nodes": [],
160+
"added_nodes": [],
161+
"required_before_violations": [],
162+
"drift_detected": false
163+
}
164+
},
165+
{
166+
"fixture_id": "cross_domain_operational_dependency_workflow_v1",
167+
"degradation_level": "baseline",
168+
"expected_admissible": true,
169+
"expected_failure_labels": [],
170+
"tool_ordering": {
171+
"original_edge_count": 0,
172+
"replay_edge_count": 0,
173+
"missing_edges": [],
174+
"added_edges": [],
175+
"original_node_count": 0,
176+
"replay_node_count": 0,
177+
"missing_nodes": [],
178+
"added_nodes": [],
179+
"required_before_violations": [],
180+
"drift_detected": false
181+
}
182+
}
183+
]
184+
},
185+
{
186+
"family": "incident_response_page_triage",
187+
"fixtures": [
188+
{
189+
"fixture_id": "incident_response_page_triage_degraded_v1",
190+
"degradation_level": "severe",
191+
"expected_admissible": false,
192+
"expected_failure_labels": [
193+
"CAUSAL_DEPENDENCY_LOSS",
194+
"INVARIANT_VIOLATION",
195+
"POLICY_ORDER_BROKEN",
196+
"RECOVERY_PATH_INVALID"
197+
],
198+
"tool_ordering": {
199+
"original_edge_count": 0,
200+
"replay_edge_count": 0,
201+
"missing_edges": [],
202+
"added_edges": [],
203+
"original_node_count": 0,
204+
"replay_node_count": 0,
205+
"missing_nodes": [],
206+
"added_nodes": [],
207+
"required_before_violations": [],
208+
"drift_detected": false
209+
}
210+
},
211+
{
212+
"fixture_id": "incident_response_page_triage_mild_v1",
213+
"degradation_level": "mild",
214+
"expected_admissible": false,
215+
"expected_failure_labels": [
216+
"RECOVERY_PATH_INVALID"
217+
],
218+
"tool_ordering": {
219+
"original_edge_count": 0,
220+
"replay_edge_count": 0,
221+
"missing_edges": [],
222+
"added_edges": [],
223+
"original_node_count": 0,
224+
"replay_node_count": 0,
225+
"missing_nodes": [],
226+
"added_nodes": [],
227+
"required_before_violations": [],
228+
"drift_detected": false
229+
}
230+
},
231+
{
232+
"fixture_id": "incident_response_page_triage_moderate_v1",
233+
"degradation_level": "moderate",
234+
"expected_admissible": false,
235+
"expected_failure_labels": [
236+
"RECOVERY_PATH_INVALID"
237+
],
238+
"tool_ordering": {
239+
"original_edge_count": 0,
240+
"replay_edge_count": 0,
241+
"missing_edges": [],
242+
"added_edges": [],
243+
"original_node_count": 0,
244+
"replay_node_count": 0,
245+
"missing_nodes": [],
246+
"added_nodes": [],
247+
"required_before_violations": [],
248+
"drift_detected": false
249+
}
250+
},
251+
{
252+
"fixture_id": "incident_response_page_triage_v1",
253+
"degradation_level": "baseline",
254+
"expected_admissible": true,
255+
"expected_failure_labels": [],
256+
"tool_ordering": {
257+
"original_edge_count": 0,
258+
"replay_edge_count": 0,
259+
"missing_edges": [],
260+
"added_edges": [],
261+
"original_node_count": 0,
262+
"replay_node_count": 0,
263+
"missing_nodes": [],
264+
"added_nodes": [],
265+
"required_before_violations": [],
266+
"drift_detected": false
267+
}
268+
}
269+
]
270+
},
271+
{
272+
"family": "mcp_trace_replay",
273+
"fixtures": [
274+
{
275+
"fixture_id": "mcp_trace_replay_degraded_v1",
276+
"degradation_level": "severe",
277+
"expected_admissible": false,
278+
"expected_failure_labels": [
279+
"CAUSAL_DEPENDENCY_LOSS",
280+
"INVARIANT_VIOLATION",
281+
"RECOVERY_PATH_INVALID"
282+
],
283+
"tool_ordering": {
284+
"original_edge_count": 0,
285+
"replay_edge_count": 0,
286+
"missing_edges": [],
287+
"added_edges": [],
288+
"original_node_count": 0,
289+
"replay_node_count": 0,
290+
"missing_nodes": [],
291+
"added_nodes": [],
292+
"required_before_violations": [],
293+
"drift_detected": false
294+
}
295+
},
296+
{
297+
"fixture_id": "mcp_trace_replay_mild_v1",
298+
"degradation_level": "mild",
299+
"expected_admissible": false,
300+
"expected_failure_labels": [
301+
"INVARIANT_VIOLATION",
302+
"RECOVERY_PATH_INVALID"
303+
],
304+
"tool_ordering": {
305+
"original_edge_count": 0,
306+
"replay_edge_count": 0,
307+
"missing_edges": [],
308+
"added_edges": [],
309+
"original_node_count": 0,
310+
"replay_node_count": 0,
311+
"missing_nodes": [],
312+
"added_nodes": [],
313+
"required_before_violations": [],
314+
"drift_detected": false
315+
}
316+
},
317+
{
318+
"fixture_id": "mcp_trace_replay_moderate_v1",
319+
"degradation_level": "moderate",
320+
"expected_admissible": false,
321+
"expected_failure_labels": [
322+
"CAUSAL_DEPENDENCY_LOSS",
323+
"INVARIANT_VIOLATION"
324+
],
325+
"tool_ordering": {
326+
"original_edge_count": 0,
327+
"replay_edge_count": 0,
328+
"missing_edges": [],
329+
"added_edges": [],
330+
"original_node_count": 0,
331+
"replay_node_count": 0,
332+
"missing_nodes": [],
333+
"added_nodes": [],
334+
"required_before_violations": [],
335+
"drift_detected": false
336+
}
337+
},
338+
{
339+
"fixture_id": "mcp_trace_replay_v1",
340+
"degradation_level": "baseline",
341+
"expected_admissible": true,
342+
"expected_failure_labels": [],
343+
"tool_ordering": {
344+
"original_edge_count": 0,
345+
"replay_edge_count": 0,
346+
"missing_edges": [],
347+
"added_edges": [],
348+
"original_node_count": 0,
349+
"replay_node_count": 0,
350+
"missing_nodes": [],
351+
"added_nodes": [],
352+
"required_before_violations": [],
353+
"drift_detected": false
354+
}
355+
}
356+
]
357+
}
358+
],
359+
"global_summary": {
360+
"family_count": 4,
361+
"fixture_count": 16,
362+
"fixtures_with_tool_ordering_data": 0,
363+
"fixtures_with_tool_ordering_drift": 0,
364+
"total_missing_tool_order_edges": 0,
365+
"total_added_tool_order_edges": 0,
366+
"total_required_before_violations": 0,
367+
"deterministic_evaluation": true,
368+
"llm_judges": "none",
369+
"external_apis": "none"
370+
}
371+
}

0 commit comments

Comments
 (0)