Skip to content

Commit 7324eaa

Browse files
weiyiweiyi
authored and committed
Make AgentCI diffs easier to debug at the step level
AgentCI could tell users that an episode changed, but the public backlog correctly identified that regression debugging was still too flat and too shallow. This change adds structured step-level diff items, carries them through regression results and JSON output, and exposes field-level step changes in the HTML report so users can see exactly where a candidate run starts to diverge.

Constraint: Preserve existing text diff prefixes so ignore rules such as metric:* keep working
Rejected: Replace flat diff items entirely with structured output | too disruptive for current CLI and regression consumers
Rejected: Limit the enhancement to HTML only | lower value than fixing the core compare pipeline once
Confidence: high
Scope-risk: narrow
Reversibility: clean
Directive: Keep future diff enrichments additive and compatibility-aware; downstream tooling may already depend on diff item prefixes
Tested: AgentCI unittest suite; direct CLI validation for diff JSON, assert-regression output, and generated HTML report field-level change rendering
Not-tested: Very large deeply nested payload diffs across long episodes
1 parent 7acd868 commit 7324eaa

4 files changed

Lines changed: 194 additions & 20 deletions

File tree

projects/agentci/src/agentci/compare.py

Lines changed: 131 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,47 @@
66
from .schema import Episode
77

88

9+
@dataclass
class StepDiffItem:
    """One field-level difference found at a single step of two episodes.

    ``to_dict`` gives the structured form used for JSON output and
    ``to_text`` gives the flat one-line form used in text diffs.
    """

    # Step number as shown to users; compare_episodes passes ``index + 1``.
    step_index: int
    kind: str
    name: str
    # Location of the difference, e.g. "kind", "name", "step" or a dotted
    # path such as "payload.output.ok".
    field_path: str
    left: Any
    right: Any
    # "changed" (default), "added" or "removed"; any other value is rendered
    # like "changed" by to_text().
    change_type: str = "changed"

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dict (values are not copied)."""
        return {
            "step_index": self.step_index,
            "kind": self.kind,
            "name": self.name,
            "field_path": self.field_path,
            "left": self.left,
            "right": self.right,
            "change_type": self.change_type,
        }

    def to_text(self) -> str:
        """Render the item as ``step N [kind:name] path: left -> right``.

        For "added" / "removed" items the absent side is shown as
        ``<missing>`` instead of the repr of the stored value.
        """
        prefix = f"step {self.step_index} [{self.kind}:{self.name}] {self.field_path}"
        if self.change_type == "added":
            return f"{prefix}: <missing> -> {self.right!r}"
        if self.change_type == "removed":
            return f"{prefix}: {self.left!r} -> <missing>"
        return f"{prefix}: {self.left!r} -> {self.right!r}"
37+
38+
939
@dataclass
class EpisodeDiff:
    """Result of comparing two episodes.

    ``items`` holds the flat text lines; ``step_items`` holds the structured
    per-step differences.
    """

    changed: bool
    items: list[str] = field(default_factory=list)
    step_items: list[StepDiffItem] = field(default_factory=list)

    def to_dict(self) -> dict[str, object]:
        """Return a JSON-ready dict; ``items`` is copied and each step item is
        converted with its own ``to_dict()``."""
        return {
            "changed": self.changed,
            "items": list(self.items),
            "step_items": [item.to_dict() for item in self.step_items],
        }
1951

2052

@@ -23,32 +55,116 @@ def _append_if_changed(items: list[str], label: str, left: Any, right: Any) -> N
2355
items.append(f"{label}: {left!r} -> {right!r}")
2456

2557

58+
def _append_nested_step_diffs(
    step_items: list[StepDiffItem],
    *,
    step_index: int,
    kind: str,
    name: str,
    field_path: str,
    left: Any,
    right: Any,
) -> None:
    """Append one StepDiffItem per differing leaf between ``left`` and ``right``.

    When both values are dicts the comparison recurses key by key, extending
    ``field_path`` with ``.key``. Any other unequal pair is recorded as a
    single "changed" item. A key that exists on only one side is recorded as
    "added" (only in ``right``) or "removed" (only in ``left``).

    Results are appended to ``step_items`` in place; nothing is returned.
    """
    if left == right:
        return

    if not (isinstance(left, dict) and isinstance(right, dict)):
        step_items.append(
            StepDiffItem(
                step_index=step_index,
                kind=kind,
                name=name,
                field_path=field_path,
                left=left,
                right=right,
            )
        )
        return

    for key in sorted(set(left) | set(right)):
        child_path = f"{field_path}.{key}" if field_path else str(key)
        left_value = left.get(key)
        right_value = right.get(key)

        if key in left and key in right:
            _append_nested_step_diffs(
                step_items,
                step_index=step_index,
                kind=kind,
                name=name,
                field_path=child_path,
                left=left_value,
                right=right_value,
            )
            continue

        # Key exists on one side only. An absent key versus an explicit None
        # is still treated as "no difference", so the set of reported
        # differences stays the same as before; only the classification of
        # one-sided keys is refined.
        if left_value == right_value:
            continue
        step_items.append(
            StepDiffItem(
                step_index=step_index,
                kind=kind,
                name=name,
                field_path=child_path,
                left=left_value,
                right=right_value,
                change_type="added" if key not in left else "removed",
            )
        )
93+
94+
2695
def compare_episodes(baseline: Episode, candidate: Episode) -> EpisodeDiff:
2796
items: list[str] = []
97+
step_items: list[StepDiffItem] = []
2898

2999
_append_if_changed(items, "model", baseline.model, candidate.model)
30100
_append_if_changed(items, "prompt_version", baseline.prompt_version, candidate.prompt_version)
31101
_append_if_changed(items, "success", baseline.success, candidate.success)
32102
_append_if_changed(items, "final_output", baseline.final_output, candidate.final_output)
33103
_append_if_changed(items, "step_count", len(baseline.steps), len(candidate.steps))
34104

35-
max_steps = min(len(baseline.steps), len(candidate.steps))
105+
max_steps = max(len(baseline.steps), len(candidate.steps))
36106
for index in range(max_steps):
37-
left = baseline.steps[index]
38-
right = candidate.steps[index]
107+
left = baseline.steps[index] if index < len(baseline.steps) else None
108+
right = candidate.steps[index] if index < len(candidate.steps) else None
109+
if left is None and right is not None:
110+
step_items.append(
111+
StepDiffItem(
112+
step_index=index + 1,
113+
kind=right.kind,
114+
name=right.name,
115+
field_path="step",
116+
left=None,
117+
right=right.payload,
118+
change_type="added",
119+
)
120+
)
121+
continue
122+
if left is not None and right is None:
123+
step_items.append(
124+
StepDiffItem(
125+
step_index=index + 1,
126+
kind=left.kind,
127+
name=left.name,
128+
field_path="step",
129+
left=left.payload,
130+
right=None,
131+
change_type="removed",
132+
)
133+
)
134+
continue
135+
assert left is not None and right is not None
39136
if left.kind != right.kind:
40-
items.append(f"step {index + 1} kind: {left.kind!r} -> {right.kind!r}")
137+
step_items.append(
138+
StepDiffItem(
139+
step_index=index + 1,
140+
kind=left.kind,
141+
name=left.name,
142+
field_path="kind",
143+
left=left.kind,
144+
right=right.kind,
145+
)
146+
)
41147
continue
42148
if left.name != right.name:
43-
items.append(f"step {index + 1} name: {left.name!r} -> {right.name!r}")
44-
if left.payload != right.payload:
45-
all_payload_keys = sorted(set(left.payload) | set(right.payload))
46-
for key in all_payload_keys:
47-
if left.payload.get(key) != right.payload.get(key):
48-
items.append(
49-
f"step {index + 1} payload.{key}: "
50-
f"{left.payload.get(key)!r} -> {right.payload.get(key)!r}"
51-
)
149+
step_items.append(
150+
StepDiffItem(
151+
step_index=index + 1,
152+
kind=left.kind,
153+
name=left.name,
154+
field_path="name",
155+
left=left.name,
156+
right=right.name,
157+
)
158+
)
159+
_append_nested_step_diffs(
160+
step_items,
161+
step_index=index + 1,
162+
kind=left.kind,
163+
name=left.name,
164+
field_path="payload",
165+
left=left.payload,
166+
right=right.payload,
167+
)
52168

53169
all_metric_keys = sorted(set(baseline.metrics) | set(candidate.metrics))
54170
for key in all_metric_keys:
@@ -59,4 +175,5 @@ def compare_episodes(baseline: Episode, candidate: Episode) -> EpisodeDiff:
59175
candidate.metrics.get(key),
60176
)
61177

62-
return EpisodeDiff(changed=bool(items), items=items)
178+
items.extend(item.to_text() for item in step_items)
179+
return EpisodeDiff(changed=bool(items), items=items, step_items=step_items)

projects/agentci/src/agentci/html_report.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,32 +4,43 @@
44
import json
55
from pathlib import Path
66

7-
from .compare import compare_episodes
7+
from .compare import StepDiffItem, compare_episodes
88
from .schema import Episode, EpisodeStep
99

1010

1111
def _pretty_json(value: object) -> str:
    """Serialize ``value`` as indented JSON with sorted keys, keeping
    non-ASCII characters as-is instead of escaping them."""
    return json.dumps(value, indent=2, sort_keys=True, ensure_ascii=False)
1313

1414

15-
def _step_rows(baseline: Episode, candidate: Episode) -> str:
15+
def _step_rows(baseline: Episode, candidate: Episode, step_items: list[StepDiffItem]) -> str:
1616
rows: list[str] = []
17+
grouped: dict[int, list[StepDiffItem]] = {}
18+
for item in step_items:
19+
grouped.setdefault(item.step_index, []).append(item)
1720
max_steps = max(len(baseline.steps), len(candidate.steps))
1821
for index in range(max_steps):
1922
left = baseline.steps[index] if index < len(baseline.steps) else None
2023
right = candidate.steps[index] if index < len(candidate.steps) else None
21-
changed = left != right
24+
item_group = grouped.get(index + 1, [])
25+
changed = left != right or bool(item_group)
2226
rows.append(
2327
"<tr>"
2428
f"<td>{index + 1}</td>"
2529
f"<td>{_render_step(left)}</td>"
2630
f"<td>{_render_step(right)}</td>"
27-
f"<td><span class='badge {'changed' if changed else 'same'}'>{'changed' if changed else 'same'}</span></td>"
31+
f"<td><span class='badge {'changed' if changed else 'same'}'>{'changed' if changed else 'same'}</span>{_render_step_diff_items(item_group)}</td>"
2832
"</tr>"
2933
)
3034
return "\n".join(rows)
3135

3236

37+
def _render_step_diff_items(items: list[StepDiffItem]) -> str:
    """Render a step's diff items as a collapsible HTML list.

    Returns an empty string when there are no items. Each item's text form
    is HTML-escaped before being placed inside ``<code>``.
    """
    if not items:
        return ""
    details = "".join(f"<li><code>{escape(item.to_text())}</code></li>" for item in items)
    return f"<details><summary>field-level changes</summary><ul>{details}</ul></details>"
42+
43+
3344
def _render_step(step: EpisodeStep | None) -> str:
3445
if step is None:
3546
return "<span class='muted'>missing</span>"
@@ -241,7 +252,7 @@ def render_diff_html_report(baseline: Episode, candidate: Episode) -> str:
241252
</tr>
242253
</thead>
243254
<tbody>
244-
{_step_rows(baseline, candidate)}
255+
{_step_rows(baseline, candidate, diff.step_items)}
245256
</tbody>
246257
</table>
247258
</section>

projects/agentci/src/agentci/regression.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from dataclasses import dataclass, field
44
from pathlib import Path
55

6-
from .compare import compare_episodes
6+
from .compare import StepDiffItem, compare_episodes
77
from .replay import replay_episode
88
from .schema import Episode
99

@@ -14,6 +14,7 @@ class RegressionResult:
1414
baseline_path: str | None = None
1515
candidate_path: str | None = None
1616
diff_items: list[str] = field(default_factory=list)
17+
step_items: list[StepDiffItem] = field(default_factory=list)
1718
replay_mismatches: list[str] = field(default_factory=list)
1819

1920
def to_dict(self) -> dict[str, object]:
@@ -22,6 +23,7 @@ def to_dict(self) -> dict[str, object]:
2223
"baseline_path": self.baseline_path,
2324
"candidate_path": self.candidate_path,
2425
"diff_items": list(self.diff_items),
26+
"step_items": [item.to_dict() for item in self.step_items],
2527
"replay_mismatches": list(self.replay_mismatches),
2628
}
2729

@@ -33,6 +35,9 @@ def failure_message(self) -> str:
3335
if self.diff_items:
3436
lines.append("- diff:")
3537
lines.extend(f" - {item}" for item in self.diff_items)
38+
if self.step_items:
39+
lines.append("- step details:")
40+
lines.extend(f" - {item.to_text()}" for item in self.step_items)
3641
if self.replay_mismatches:
3742
lines.append("- replay mismatches:")
3843
lines.extend(f" - {item}" for item in self.replay_mismatches)
@@ -58,6 +63,7 @@ def run_regression_check(
5863
) -> RegressionResult:
5964
diff = compare_episodes(baseline, candidate)
6065
kept_items = [item for item in diff.items if _keep_item(item, ignore_diff_prefixes)]
66+
kept_step_items = [item for item in diff.step_items if _keep_item(item.to_text(), ignore_diff_prefixes)]
6167
replay_mismatches: list[str] = []
6268
if check_candidate_replay:
6369
replay = replay_episode(candidate, strict=True)
@@ -68,6 +74,7 @@ def run_regression_check(
6874
baseline_path=baseline_path,
6975
candidate_path=candidate_path,
7076
diff_items=kept_items,
77+
step_items=kept_step_items,
7178
replay_mismatches=replay_mismatches,
7279
)
7380

projects/agentci/tests/test_agentci.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,45 @@ def test_summarize_cli_can_emit_json(self):
178178
self.assertEqual(payload["tool_calls"], 1)
179179
self.assertEqual(payload["model_calls"], 1)
180180

181+
182+
def test_diff_json_includes_step_items_for_nested_payload_changes(self):
    """`diff --json` reports a structured step item for a nested payload change."""
    baseline = self._build_episode()
    candidate = self._build_episode()
    candidate.steps[1].payload["output"] = {"ok": False, "reason": "timeout"}
    with tempfile.TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        baseline_path = root / "baseline.json"
        candidate_path = root / "candidate.json"
        baseline.save(baseline_path)
        candidate.save(candidate_path)
        output = StringIO()
        # The CLI must run while the temporary episode files still exist.
        with redirect_stdout(output):
            code = cli_main(["diff", str(baseline_path), str(candidate_path), "--json"])
    payload = json.loads(output.getvalue())
    self.assertEqual(code, 0)
    self.assertTrue(payload["changed"])
    self.assertTrue(payload["step_items"])
    self.assertEqual(payload["step_items"][0]["step_index"], 2)
    self.assertIn("payload.output", payload["step_items"][0]["field_path"])
201+
202+
def test_assert_regression_failure_message_includes_step_details(self):
203+
baseline = self._build_episode()
204+
candidate = self._build_episode()
205+
candidate.steps[1].payload["status"] = "error"
206+
result = run_regression_check(baseline, candidate, check_candidate_replay=False)
207+
self.assertFalse(result.passed)
208+
message = result.failure_message()
209+
self.assertIn("step details:", message)
210+
self.assertIn("step 2 [tool_call:tool] payload.status", message)
211+
212+
def test_html_report_renders_field_level_step_changes(self):
213+
baseline = self._build_episode()
214+
candidate = self._build_episode()
215+
candidate.steps.append(type(baseline.steps[0])(kind="note", name="extra", payload={"x": 1}))
216+
html = render_diff_html_report(baseline, candidate)
217+
self.assertIn("field-level changes", html)
218+
self.assertIn("step 3 [note:extra] step", html)
219+
181220
def test_diff_cli_can_emit_json(self):
182221
baseline = self._build_episode()
183222
candidate = self._build_episode()

0 commit comments

Comments
 (0)